From: Richard Sandiford
Date: Thu, 1 Feb 2018 11:04:28 +0000 (+0000)
Subject: [AArch64] Handle SVE subregs that are effectively REVs
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=002092be4027b1fd667178e9bbe99fa6ecee2e10;p=gcc.git

[AArch64] Handle SVE subregs that are effectively REVs

Subreg reads should be equivalent to storing the inner register to
memory and loading the appropriate memory bytes back, with subreg
writes doing the reverse.  For the reasons explained in the comments,
this isn't what happens for big-endian SVE if we simply reinterpret
one vector register as having a different element size, so the
conceptual store and load is needed in the general case.

However, that obviously produces poor code if we do it too often.
The patch therefore adds a pattern for handling the operation in
registers.  This copes with the important case of a VIEW_CONVERT
created by tree-vect-slp.c:duplicate_and_interleave.

It might make sense to tighten the predicates in aarch64-sve.md so
that such subregs are not allowed as operands to most instructions,
but that's future work.

This fixes the sve/slp_*.c tests on aarch64_be.

2018-02-01  Richard Sandiford

gcc/
	* config/aarch64/aarch64-protos.h (aarch64_split_sve_subreg_move)
	(aarch64_maybe_expand_sve_subreg_move): Declare.
	* config/aarch64/aarch64.md (UNSPEC_REV_SUBREG): New unspec.
	* config/aarch64/predicates.md (aarch64_any_register_operand): New
	predicate.
	* config/aarch64/aarch64-sve.md (mov<mode>): Optimize subreg moves
	that are semantically a reverse operation.
	(*aarch64_sve_mov<mode>_subreg_be): New pattern.
	* config/aarch64/aarch64.c (aarch64_maybe_expand_sve_subreg_move):
	(aarch64_replace_reg_mode, aarch64_split_sve_subreg_move): New
	functions.
	(aarch64_can_change_mode_class): For big-endian, forbid changes
	between two SVE modes if they have different element sizes.

Reviewed-by: James Greenhalgh

From-SVN: r257289
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index db763a63648..b655476aaac 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,19 @@
+2018-02-01  Richard Sandiford
+
+	* config/aarch64/aarch64-protos.h (aarch64_split_sve_subreg_move)
+	(aarch64_maybe_expand_sve_subreg_move): Declare.
+	* config/aarch64/aarch64.md (UNSPEC_REV_SUBREG): New unspec.
+	* config/aarch64/predicates.md (aarch64_any_register_operand): New
+	predicate.
+	* config/aarch64/aarch64-sve.md (mov<mode>): Optimize subreg moves
+	that are semantically a reverse operation.
+	(*aarch64_sve_mov<mode>_subreg_be): New pattern.
+	* config/aarch64/aarch64.c (aarch64_maybe_expand_sve_subreg_move):
+	(aarch64_replace_reg_mode, aarch64_split_sve_subreg_move): New
+	functions.
+	(aarch64_can_change_mode_class): For big-endian, forbid changes
+	between two SVE modes if they have different element sizes.
+
 2018-02-01  Richard Sandiford
 
 	* config/aarch64/aarch64.c (aarch64_expand_sve_const_vector): Prefer
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index ef1b0bc8e28..cda2895d28e 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -447,6 +447,8 @@ void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
 void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
 void aarch64_expand_sve_mem_move (rtx, rtx, machine_mode);
+bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx);
+void aarch64_split_sve_subreg_move (rtx, rtx, rtx);
 void aarch64_expand_prologue (void);
 void aarch64_expand_vector_init (rtx, rtx);
 void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx,
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 068fd8cbf81..9140862d747 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -84,6 +84,32 @@
 				      gen_vec_duplicate<mode>);
 	DONE;
       }
+
+    /* Optimize subregs on big-endian targets: we can use REV[BHW]
+       instead of going through memory.  */
+    if (BYTES_BIG_ENDIAN
+	&& aarch64_maybe_expand_sve_subreg_move (operands[0], operands[1]))
+      DONE;
   }
 )
+
+;; A pattern for optimizing SUBREGs that have a reinterpreting effect
+;; on big-endian targets; see aarch64_maybe_expand_sve_subreg_move
+;; for details.  We use a special predicate for operand 2 to reduce
+;; the number of patterns.
+(define_insn_and_split "*aarch64_sve_mov<mode>_subreg_be"
+  [(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w")
+	(unspec:SVE_ALL
+	  [(match_operand:VNx16BI 1 "register_operand" "Upl")
+	   (match_operand 2 "aarch64_any_register_operand" "w")]
+	  UNSPEC_REV_SUBREG))]
+  "TARGET_SVE && BYTES_BIG_ENDIAN"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    aarch64_split_sve_subreg_move (operands[0], operands[1], operands[2]);
+    DONE;
+  }
+)
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6296ffe959f..7b34bdf5cf4 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3074,6 +3074,120 @@ aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
   aarch64_emit_sve_pred_move (dest, ptrue, src);
 }
 
+/* Called only on big-endian targets.  See whether an SVE vector move
+   from SRC to DEST is effectively a REV[BHW] instruction, because at
+   least one operand is a subreg of an SVE vector that has wider or
+   narrower elements.  Return true and emit the instruction if so.
+
+   For example:
+
+     (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
+
+   represents a VIEW_CONVERT between the following vectors, viewed
+   in memory order:
+
+     R2: { [0].high, [0].low,  [1].high, [1].low, ... }
+     R1: { [0],      [1],      [2],      [3],     ... }
+
+   The high part of lane X in R2 should therefore correspond to lane X*2
+   of R1, but the register representations are:
+
+         msb                                      lsb
+     R2: ...... [1].high  [1].low   [0].high  [0].low
+     R1: ...... [3]       [2]       [1]       [0]
+
+   where the low part of lane X in R2 corresponds to lane X*2 in R1.
+   We therefore need a reverse operation to swap the high and low values
+   around.
+
+   This is purely an optimization.  Without it we would spill the
+   subreg operand to the stack in one mode and reload it in the
+   other mode, which has the same effect as the REV.  */
+
+bool
+aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
+{
+  gcc_assert (BYTES_BIG_ENDIAN);
+  if (GET_CODE (dest) == SUBREG)
+    dest = SUBREG_REG (dest);
+  if (GET_CODE (src) == SUBREG)
+    src = SUBREG_REG (src);
+
+  /* The optimization handles two single SVE REGs with different element
+     sizes.  */
+  if (!REG_P (dest)
+      || !REG_P (src)
+      || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
+      || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
+      || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
+	  == GET_MODE_UNIT_SIZE (GET_MODE (src))))
+    return false;
+
+  /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
+  rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
+  rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
+			       UNSPEC_REV_SUBREG);
+  emit_insn (gen_rtx_SET (dest, unspec));
+  return true;
+}
+
+/* Return a copy of X with mode MODE, without changing its other
+   attributes.  Unlike gen_lowpart, this doesn't care whether the
+   mode change is valid.  */
+
+static rtx
+aarch64_replace_reg_mode (rtx x, machine_mode mode)
+{
+  if (GET_MODE (x) == mode)
+    return x;
+
+  x = shallow_copy_rtx (x);
+  set_mode_and_regno (x, mode, REGNO (x));
+  return x;
+}
+
+/* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
+   operands.  */
+
+void
+aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
+{
+  /* Decide which REV operation we need.  The mode with narrower elements
+     determines the mode of the operands and the mode with the wider
+     elements determines the reverse width.  */
+  machine_mode mode_with_wider_elts = GET_MODE (dest);
+  machine_mode mode_with_narrower_elts = GET_MODE (src);
+  if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
+      < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
+    std::swap (mode_with_wider_elts, mode_with_narrower_elts);
+
+  unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
+  unsigned int unspec;
+  if (wider_bytes == 8)
+    unspec = UNSPEC_REV64;
+  else if (wider_bytes == 4)
+    unspec = UNSPEC_REV32;
+  else if (wider_bytes == 2)
+    unspec = UNSPEC_REV16;
+  else
+    gcc_unreachable ();
+  machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
+
+  /* Emit:
+
+       (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV)]
+		  UNSPEC_MERGE_PTRUE))
+
+     with the appropriate modes.  */
+  ptrue = gen_lowpart (pred_mode, ptrue);
+  dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
+  src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
+  src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
+  src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
+			UNSPEC_MERGE_PTRUE);
+  emit_insn (gen_rtx_SET (dest, src));
+}
+
 static bool
 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
 				 tree exp ATTRIBUTE_UNUSED)
@@ -17197,10 +17311,27 @@ static bool
 aarch64_can_change_mode_class (machine_mode from,
 			       machine_mode to, reg_class_t)
 {
-  /* See the comment at the head of aarch64-sve.md for details.  */
-  if (BYTES_BIG_ENDIAN
-      && (aarch64_sve_data_mode_p (from) != aarch64_sve_data_mode_p (to)))
-    return false;
+  if (BYTES_BIG_ENDIAN)
+    {
+      bool from_sve_p = aarch64_sve_data_mode_p (from);
+      bool to_sve_p = aarch64_sve_data_mode_p (to);
+
+      /* Don't allow changes between SVE data modes and non-SVE modes.
+	 See the comment at the head of aarch64-sve.md for details.  */
+      if (from_sve_p != to_sve_p)
+	return false;
+
+      /* Don't allow changes in element size: lane 0 of the new vector
+	 would not then be lane 0 of the old vector.  See the comment
+	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
+	 description.
+
+	 In the worst case, this forces a register to be spilled in
+	 one mode and reloaded in the other, which handles the
+	 endianness correctly.  */
+      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
+	return false;
+    }
   return true;
 }
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 49095f8f3d9..5a2a9309a3b 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -168,6 +168,7 @@
     UNSPEC_INSR
     UNSPEC_CLASTB
     UNSPEC_FADDA
+    UNSPEC_REV_SUBREG
 ])
 
 (define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 159e74a1ed7..804be16a13a 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -617,3 +617,7 @@
 (define_predicate "aarch64_gather_scale_operand_d"
   (and (match_code "const_int")
        (match_test "INTVAL (op) == 1 || INTVAL (op) == 8")))
+
+;; A special predicate that doesn't match a particular mode.
+(define_special_predicate "aarch64_any_register_operand"
+  (match_code "reg"))
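
Illustration (not part of the patch): the standalone C program below is a
minimal sketch of the byte-layout argument in the comment above
aarch64_maybe_expand_sve_subreg_move, simulating one 128-bit granule of a
VNx8HI register reinterpreted as VNx16QI.  It models lane 0 as the least
significant end of the register; the lane values, the NH/NB constants and
the REVB-style fix-up loop are assumptions made for the example, not code
taken from GCC.

/* Simulate a big-endian subreg of VNx8HI as VNx16QI for one granule.  */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NH 8		/* halfword lanes in the granule */
#define NB 16		/* byte lanes in the granule */

int
main (void)
{
  uint16_t h[NH] = { 0x0102, 0x0304, 0x0506, 0x0708,
		     0x090a, 0x0b0c, 0x0d0e, 0x0f10 };

  /* Register bytes, index 0 = least significant byte (lane 0 end).  */
  uint8_t reg[NB];
  for (int k = 0; k < NH; k++)
    {
      reg[2 * k] = h[k] & 0xff;		/* low byte of lane k */
      reg[2 * k + 1] = h[k] >> 8;	/* high byte of lane k */
    }

  /* Subreg semantics: store big-endian, reload as 16 byte lanes.  */
  uint8_t mem[NB], want[NB];
  for (int k = 0; k < NH; k++)
    {
      mem[2 * k] = h[k] >> 8;		/* big-endian store: high byte first */
      mem[2 * k + 1] = h[k] & 0xff;
    }
  memcpy (want, mem, NB);		/* byte lane i = memory byte i */

  /* Naive reinterpretation: byte lane i = register byte i.  */
  uint8_t naive[NB];
  memcpy (naive, reg, NB);

  /* REVB-style fix-up: reverse the bytes within each halfword element.  */
  uint8_t fixed[NB];
  for (int k = 0; k < NH; k++)
    {
      fixed[2 * k] = reg[2 * k + 1];
      fixed[2 * k + 1] = reg[2 * k];
    }

  printf ("naive reinterpretation matches subreg semantics: %s\n",
	  memcmp (naive, want, NB) == 0 ? "yes" : "no");	/* no */
  printf ("byte-reversed elements match subreg semantics:   %s\n",
	  memcmp (fixed, want, NB) == 0 ? "yes" : "no");	/* yes */
  return 0;
}

On this model the plain relabelling of register bytes disagrees with the
store-and-reload semantics, while reversing the narrower elements within
each wider element makes the two agree, which is consistent with
aarch64_split_sve_subreg_move selecting UNSPEC_REV16, UNSPEC_REV32 or
UNSPEC_REV64 according to the wider of the two element sizes.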