[patch AArch64] Do not perform a vector splat for vector initialisation if it is not useful

author James Greenhalgh <james.greenhalgh@arm.com>

Thu, 21 Dec 2017 16:39:43 +0000 (16:39 +0000)

committer James Greenhalgh <jgreenhalgh@gcc.gnu.org>

Thu, 21 Dec 2017 16:39:43 +0000 (16:39 +0000)
author James Greenhalgh <james.greenhalgh@arm.com>
Thu, 21 Dec 2017 16:39:43 +0000 (16:39 +0000)
committer James Greenhalgh <jgreenhalgh@gcc.gnu.org>
Thu, 21 Dec 2017 16:39:43 +0000 (16:39 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index c6dc455f319b7b2e39e9d244c3ec2c0699b6aebb..d3c4063a5ce784992492f7b2ac7e3f6ad7afb44b 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2017-12-21  James Greenhalgh  <james.greenhalgh@arm.com>
+
+       * config/aarch64/aarch64.c (aarch64_expand_vector_init): Modify code
+       generation for cases where splatting a value is not useful.
+       * simplify-rtx.c (simplify_ternary_operation): Simplify vec_merge
+       across a vec_duplicate and a paradoxical subreg forming a vector
+       mode to a vec_concat.
+
  2017-12-21  James Greenhalgh  <james.greenhalgh@arm.com>
  
         * combine.c (simplify_set): Do not transform subregs to zero_extends
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index fc27b4073dd88402ab6e7ab05f6cafacca256494..1da313f57e0eed4df36dbd15aecbae9fd73f7388 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12107,9 +12107,51 @@ aarch64_expand_vector_init (rtx target, rtx vals)
             maxv = matches[i][1];
           }
  
-      /* Create a duplicate of the most common element.  */
-      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
-      aarch64_emit_move (target, gen_vec_duplicate (mode, x));
+      /* Create a duplicate of the most common element, unless all elements
+        are equally useless to us, in which case just immediately set the
+        vector register using the first element.  */
+
+      if (maxv == 1)
+       {
+         /* For vectors of two 64-bit elements, we can do even better.  */
+         if (n_elts == 2
+             && (inner_mode == E_DImode
+                 || inner_mode == E_DFmode))
+
+           {
+             rtx x0 = XVECEXP (vals, 0, 0);
+             rtx x1 = XVECEXP (vals, 0, 1);
+             /* Combine can pick up this case, but handling it directly
+                here leaves clearer RTL.
+
+                This is load_pair_lanes<mode>, and also gives us a clean-up
+                for store_pair_lanes<mode>.  */
+             if (memory_operand (x0, inner_mode)
+                 && memory_operand (x1, inner_mode)
+                 && !STRICT_ALIGNMENT
+                 && rtx_equal_p (XEXP (x1, 0),
+                                 plus_constant (Pmode,
+                                                XEXP (x0, 0),
+                                                GET_MODE_SIZE (inner_mode))))
+               {
+                 rtx t;
+                 if (inner_mode == DFmode)
+                   t = gen_load_pair_lanesdf (target, x0, x1);
+                 else
+                   t = gen_load_pair_lanesdi (target, x0, x1);
+                 emit_insn (t);
+                 return;
+               }
+           }
+         rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+         aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
+         maxelement = 0;
+       }
+      else
+       {
+         rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+         aarch64_emit_move (target, gen_vec_duplicate (mode, x));
+       }
  
        /* Insert the rest.  */
        for (int i = 0; i < n_elts; i++)
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c

index 4f9796c7c84bede12550c8492a38bf502ba4248f..6b163f9169960dd09de73bcc780e096487fc1eab 100644 (file)
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -5860,6 +5860,57 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode,
                 return simplify_gen_binary (VEC_CONCAT, mode, newop0, newop1);
             }
  
+         /* Replace:
+
+             (vec_merge:outer (vec_duplicate:outer x:inner)
+                              (subreg:outer y:inner 0)
+                              (const_int N))
+
+            with (vec_concat:outer x:inner y:inner) if N == 1,
+            or (vec_concat:outer y:inner x:inner) if N == 2.
+            We assume that degenerate cases (N == 0 or N == 3), which
+            represent taking all elements from either input, are handled
+            elsewhere.
+
+            Implicitly, this means we have a paradoxical subreg, but such
+            a check is cheap, so make it anyway.
+
+            Only applies for vectors of two elements.  */
+
+         if ((GET_CODE (op0) == VEC_DUPLICATE
+              || GET_CODE (op1) == VEC_DUPLICATE)
+             && GET_MODE (op0) == GET_MODE (op1)
+             && GET_MODE_NUNITS (GET_MODE (op0)) == 2
+             && GET_MODE_NUNITS (GET_MODE (op1)) == 2
+             && IN_RANGE (sel, 1, 2))
+           {
+             rtx newop0 = op0, newop1 = op1;
+
+             /* Canonicalize locally such that the VEC_DUPLICATE is always
+                the first operand.  */
+             if (GET_CODE (newop1) == VEC_DUPLICATE)
+               {
+                 std::swap (newop0, newop1);
+                 /* If we swap the operand order, we also need to swap
+                    the selector mask.  */
+                 sel = sel == 1 ? 2 : 1;
+               }
+
+             if (GET_CODE (newop1) == SUBREG
+                 && paradoxical_subreg_p (newop1)
+                 && subreg_lowpart_p (newop1)
+                 && GET_MODE (SUBREG_REG (newop1))
+                     == GET_MODE (XEXP (newop0, 0)))
+               {
+                 newop0 = XEXP (newop0, 0);
+                 newop1 = SUBREG_REG (newop1);
+                 if (sel == 2)
+                   std::swap (newop0, newop1);
+                 return simplify_gen_binary (VEC_CONCAT, mode,
+                                             newop0, newop1);
+               }
+           }
+
           /* Replace (vec_merge (vec_duplicate x) (vec_duplicate y)
                                  (const_int n))
              with (vec_concat x y) or (vec_concat y x) depending on value
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index a60791172e8629924b221a8b0bb78db2be69ae55..7171973d9699180c5b29326fd010dbd63a1b8bf9 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2017-12-21  James Greenhalgh  <james.greenhalgh@arm.com>
+
+       * gcc.target/aarch64/vect-slp-dup.c: New.
+
  2017-12-21  Eric Botcazou  <ebotcazou@adacore.com>
  
         * c-c++-common/pr82872.c: New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c b/gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c

new file mode 100644 (file)

index 0000000..0541e48
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+
+/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */
+
+void bar (double);
+
+void
+foo (double *restrict in, double *restrict in2,
+     double *restrict out1, double *restrict out2)
+{
+  for (int i = 0; i < 1024; i++)
+    {
+      out1[i] = in[i] + 2.0 * in[i+128];
+      out1[i+1] = in[i+1] + 2.0 * in2[i];
+      bar (in[i]);
+    }
+}
+
+/* { dg-final { scan-assembler-not "dup\tv\[0-9\]+.2d, v\[0-9\]+" } } */
+
author	James Greenhalgh <james.greenhalgh@arm.com>
	Thu, 21 Dec 2017 16:39:43 +0000 (16:39 +0000)
committer	James Greenhalgh <jgreenhalgh@gcc.gnu.org>
	Thu, 21 Dec 2017 16:39:43 +0000 (16:39 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/aarch64/aarch64.c		patch \| blob \| history
gcc/simplify-rtx.c		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/vect-slp-dup.c	[new file with mode: 0644]	patch \| blob