From 41dab855dce20d5d7042c9330dd8124d0ece19c0 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Thu, 6 Jun 2019 13:59:07 +0000
Subject: [PATCH] [AArch64] PR tree-optimization/90332: Implement
 vec_init<M><N> where N is a vector mode

This patch fixes the failing gcc.dg/vect/slp-reduc-sad-2.c testcase on aarch64
by implementing a vec_init optab that can handle two half-width vectors producing a full-width one
by concatenating them.

In the gcc.dg/vect/slp-reduc-sad-2.c case it's a V8QI reg concatenated with a V8QI const_vector of zeroes.
This can be implemented efficiently using the aarch64_combinez pattern that just loads a D-register to make
use of the implicit zero-extending semantics of that load.
Otherwise it concatenates the two vector using aarch64_simd_combine.

With this patch I'm seeing the effect from richi's original patch that added gcc.dg/vect/slp-reduc-sad-2.c on aarch64
and 525.x264_r improves by about 1.5%.

	PR tree-optimization/90332
	* config/aarch64/aarch64.c (aarch64_expand_vector_init):
	Handle VALS containing two vectors.
	* config/aarch64/aarch64-simd.md (*aarch64_combinez<mode>): Rename
	to...
	(@aarch64_combinez<mode>): ... This.
	(*aarch64_combinez_be<mode>): Rename to...
	(@aarch64_combinez_be<mode>): ... This.
	(vec_init<mode><Vhalf>): New define_expand.
	* config/aarch64/iterators.md (Vhalf): Handle V8HF.

From-SVN: r272002
---
 gcc/ChangeLog                      | 13 ++++++++++
 gcc/config/aarch64/aarch64-simd.md | 13 ++++++++--
 gcc/config/aarch64/aarch64.c       | 39 ++++++++++++++++++++++++++++++
 gcc/config/aarch64/iterators.md    |  1 +
 4 files changed, 64 insertions(+), 2 deletions(-)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 41e0c17f33c..ae15b05c65f 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,16 @@
+2019-06-06  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+	PR tree-optimization/90332
+	* config/aarch64/aarch64.c (aarch64_expand_vector_init):
+	Handle VALS containing two vectors.
+	* config/aarch64/aarch64-simd.md (*aarch64_combinez<mode>): Rename
+	to...
+	(@aarch64_combinez<mode>): ... This.
+	(*aarch64_combinez_be<mode>): Rename to...
+	(@aarch64_combinez_be<mode>): ... This.
+	(vec_init<mode><Vhalf>): New define_expand.
+	* config/aarch64/iterators.md (Vhalf): Handle V8HF.
+
 2019-06-06  Jozef Lawrynowicz  <jozef.l@mittosystems.com>
 
 	* config/msp430/msp430.md (ashlhi3): Use the const_variant of shift
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index b648e9e7916..df8bf1d9778 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3216,7 +3216,7 @@
 ;; In this insn, operand 1 should be low, and operand 2 the high part of the
 ;; dest vector.
 
-(define_insn "*aarch64_combinez<mode>"
+(define_insn "@aarch64_combinez<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
 	(vec_concat:<VDBL>
 	  (match_operand:VDC 1 "general_operand" "w,?r,m")
@@ -3230,7 +3230,7 @@
    (set_attr "arch" "simd,fp,simd")]
 )
 
-(define_insn "*aarch64_combinez_be<mode>"
+(define_insn "@aarch64_combinez_be<mode>"
   [(set (match_operand:<VDBL> 0 "register_operand" "=w,w,w")
         (vec_concat:<VDBL>
 	  (match_operand:VDC 2 "aarch64_simd_or_scalar_imm_zero")
@@ -5959,6 +5959,15 @@
   DONE;
 })
 
+(define_expand "vec_init<mode><Vhalf>"
+  [(match_operand:VQ_NO2E 0 "register_operand" "")
+   (match_operand 1 "" "")]
+  "TARGET_SIMD"
+{
+  aarch64_expand_vector_init (operands[0], operands[1]);
+  DONE;
+})
+
 (define_insn "*aarch64_simd_ld1r<mode>"
   [(set (match_operand:VALL_F16 0 "register_operand" "=w")
 	(vec_duplicate:VALL_F16
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b3c42e210c5..9a035dd9ed8 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -15106,6 +15106,45 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   rtx v0 = XVECEXP (vals, 0, 0);
   bool all_same = true;
 
+  /* This is a special vec_init<M><N> where N is not an element mode but a
+     vector mode with half the elements of M.  We expect to find two entries
+     of mode N in VALS and we must put their concatentation into TARGET.  */
+  if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
+    {
+      gcc_assert (known_eq (GET_MODE_SIZE (mode),
+		  2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
+      rtx lo = XVECEXP (vals, 0, 0);
+      rtx hi = XVECEXP (vals, 0, 1);
+      machine_mode narrow_mode = GET_MODE (lo);
+      gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
+      gcc_assert (narrow_mode == GET_MODE (hi));
+
+      /* When we want to concatenate a half-width vector with zeroes we can
+	 use the aarch64_combinez[_be] patterns.  Just make sure that the
+	 zeroes are in the right half.  */
+      if (BYTES_BIG_ENDIAN
+	  && aarch64_simd_imm_zero (lo, narrow_mode)
+	  && general_operand (hi, narrow_mode))
+	emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
+      else if (!BYTES_BIG_ENDIAN
+	       && aarch64_simd_imm_zero (hi, narrow_mode)
+	       && general_operand (lo, narrow_mode))
+	emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
+      else
+	{
+	  /* Else create the two half-width registers and combine them.  */
+	  if (!REG_P (lo))
+	    lo = force_reg (GET_MODE (lo), lo);
+	  if (!REG_P (hi))
+	    hi = force_reg (GET_MODE (hi), hi);
+
+	  if (BYTES_BIG_ENDIAN)
+	    std::swap (lo, hi);
+	  emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
+	}
+     return;
+   }
+
   /* Count the number of variable elements to initialise.  */
   for (int i = 0; i < n_elts; ++i)
     {
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index d0070b1a732..2179e6f21db 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -768,6 +768,7 @@
 ;; Half modes of all vector modes, in lower-case.
 (define_mode_attr Vhalf [(V8QI "v4qi")  (V16QI "v8qi")
 			 (V4HI "v2hi")  (V8HI  "v4hi")
+			 (V8HF  "v4hf")
 			 (V2SI "si")    (V4SI  "v2si")
 			 (V2DI "di")    (V2SF  "sf")
 			 (V4SF "v2sf")  (V2DF  "df")])
-- 
2.30.2