+2019-08-13 Richard Sandiford <richard.sandiford@arm.com>
+
+ * machmode.h (opt_mode::else_mode): New function.
+ (opt_mode::else_blk): Use it.
+ * config/aarch64/aarch64-protos.h (aarch64_vq_mode): Declare.
+ (aarch64_full_sve_mode, aarch64_sve_ld1rq_operand_p): Likewise.
+ (aarch64_gen_stepped_int_parallel): Likewise.
+ (aarch64_stepped_int_parallel_p): Likewise.
+ (aarch64_expand_mov_immediate): Remove the optional gen_vec_duplicate
+ argument.
+ * config/aarch64/aarch64.c
+ (aarch64_expand_sve_widened_duplicate): Delete.
+ (aarch64_expand_sve_dupq, aarch64_expand_sve_ld1rq): New functions.
+ (aarch64_expand_sve_const_vector): Rewrite to handle more cases.
+ (aarch64_expand_mov_immediate): Remove the optional gen_vec_duplicate
+ argument. Use early returns in the !CONST_INT_P handling.
+ Pass all SVE data vectors to aarch64_expand_sve_const_vector rather
+ than handling some inline.
+ (aarch64_full_sve_mode, aarch64_vq_mode): New functions, split out
+ from...
+ (aarch64_simd_container_mode): ...here.
+ (aarch64_gen_stepped_int_parallel, aarch64_stepped_int_parallel_p)
+ (aarch64_sve_ld1rq_operand_p): New functions.
+ * config/aarch64/predicates.md (descending_int_parallel)
+ (aarch64_sve_ld1rq_operand): New predicates.
+ * config/aarch64/constraints.md (UtQ): New constraint.
+ * config/aarch64/aarch64.md (UNSPEC_REINTERPRET): New unspec.
+	* config/aarch64/aarch64-sve.md (mov<SVE_ALL:mode>): Remove the
+	gen_vec_duplicate argument from the call to
+	aarch64_expand_mov_immediate.
+ (@aarch64_sve_reinterpret<mode>): New expander.
+ (*aarch64_sve_reinterpret<mode>): New pattern.
+ (@aarch64_vec_duplicate_vq<mode>_le): New pattern.
+ (@aarch64_vec_duplicate_vq<mode>_be): Likewise.
+ (*sve_ld1rq<Vesize>): Replace with...
+ (@aarch64_sve_ld1rq<mode>): ...this new pattern.
+
2019-08-13 Wilco Dijkstra <wdijkstr@arm.com>
* config/aarch64/aarch64.c (generic_tunings): Set function alignment to
bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode);
int aarch64_branch_cost (bool, bool);
enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
+opt_machine_mode aarch64_vq_mode (scalar_mode);
+opt_machine_mode aarch64_full_sve_mode (scalar_mode);
bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT);
bool aarch64_simd_mem_operand_p (rtx);
bool aarch64_sve_ld1r_operand_p (rtx);
+bool aarch64_sve_ld1rq_operand_p (rtx);
bool aarch64_sve_ldr_operand_p (rtx);
bool aarch64_sve_struct_memory_operand_p (rtx);
rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool);
+rtx aarch64_gen_stepped_int_parallel (unsigned int, int, int);
+bool aarch64_stepped_int_parallel_p (rtx, int);
rtx aarch64_tls_get_addr (void);
tree aarch64_fold_builtin (tree, int, tree *, bool);
unsigned aarch64_dbx_register_number (unsigned);
const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
void aarch64_err_no_fpadvsimd (machine_mode);
void aarch64_expand_epilogue (bool);
-void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
+void aarch64_expand_mov_immediate (rtx, rtx);
rtx aarch64_ptrue_reg (machine_mode);
rtx aarch64_pfalse_reg (machine_mode);
void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
if (CONSTANT_P (operands[1]))
{
- aarch64_expand_mov_immediate (operands[0], operands[1],
- gen_vec_duplicate<mode>);
+ aarch64_expand_mov_immediate (operands[0], operands[1]);
DONE;
}
}
)
+;; Reinterpret operand 1 in operand 0's mode, without changing its contents.
+;; This is equivalent to a subreg on little-endian targets but not for
+;; big-endian; see the comment at the head of the file for details.
+(define_expand "@aarch64_sve_reinterpret<mode>"
+ [(set (match_operand:SVE_ALL 0 "register_operand")
+ (unspec:SVE_ALL [(match_operand 1 "aarch64_any_register_operand")]
+ UNSPEC_REINTERPRET))]
+ "TARGET_SVE"
+ {
+ if (!BYTES_BIG_ENDIAN)
+ {
+ emit_move_insn (operands[0], gen_lowpart (<MODE>mode, operands[1]));
+ DONE;
+ }
+ }
+)
+
+;; A pattern for handling type punning on big-endian targets. We use a
+;; special predicate for operand 1 to reduce the number of patterns.
+(define_insn_and_split "*aarch64_sve_reinterpret<mode>"
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (unspec:SVE_ALL [(match_operand 1 "aarch64_any_register_operand" "0")]
+ UNSPEC_REINTERPRET))]
+ "TARGET_SVE"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (match_dup 1))]
+ {
+ emit_note (NOTE_INSN_DELETED);
+ DONE;
+ }
+)
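+
+;; For example, aarch64_expand_mov_immediate uses this expander when
+;; aarch64_expand_sve_const_vector returns its result in a different
+;; register (and possibly a different SVE data mode) from the
+;; destination:
+;;
+;;   emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));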
+
;; -------------------------------------------------------------------------
;; ---- Moves of multiple vectors
;; -------------------------------------------------------------------------
[(set_attr "length" "4,4,8")]
)
+;; Duplicate an Advanced SIMD vector to fill an SVE vector (LE version).
+(define_insn "@aarch64_vec_duplicate_vq<mode>_le"
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (vec_duplicate:SVE_ALL
+ (match_operand:<V128> 1 "register_operand" "w")))]
+ "TARGET_SVE && !BYTES_BIG_ENDIAN"
+ {
+ operands[1] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+ return "dup\t%0.q, %1.q[0]";
+ }
+)
+
+;; Duplicate an Advanced SIMD vector to fill an SVE vector (BE version).
+;; The SVE register layout puts memory lane N into (architectural)
+;; register lane N, whereas the Advanced SIMD layout puts the memory
+;; lsb into the register lsb. We therefore have to describe this in rtl
+;; terms as a reverse of the V128 vector followed by a duplicate.
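+;;
+;; For example, with a VNx8HI destination <V128> is V8HI, so operand 2
+;; must be (parallel [7 6 5 4 3 2 1 0]); descending_int_parallel checks
+;; that the step is -1 and the insn condition checks that the series
+;; starts at the final lane, so the vec_select describes a full reverse
+;; of operand 1.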
+(define_insn "@aarch64_vec_duplicate_vq<mode>_be"
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (vec_duplicate:SVE_ALL
+ (vec_select:<V128>
+ (match_operand:<V128> 1 "register_operand" "w")
+ (match_operand 2 "descending_int_parallel"))))]
+ "TARGET_SVE
+ && BYTES_BIG_ENDIAN
+ && known_eq (INTVAL (XVECEXP (operands[2], 0, 0)),
+ GET_MODE_NUNITS (<V128>mode) - 1)"
+ {
+ operands[1] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+ return "dup\t%0.q, %1.q[0]";
+ }
+)
+
;; This is used for vec_duplicate<mode>s from memory, but can also
;; be used by combine to optimize selects of a vec_duplicate<mode>
;; with zero.
"ld1r<Vesize>\t%0.<Vetype>, %1/z, %2"
)
-;; Load 128 bits from memory and duplicate to fill a vector. Since there
-;; are so few operations on 128-bit "elements", we don't define a VNx1TI
-;; and simply use vectors of bytes instead.
-(define_insn "*sve_ld1rq<Vesize>"
+;; Load 128 bits from memory under predicate control and duplicate to
+;; fill a vector.
+(define_insn "@aarch64_sve_ld1rq<mode>"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w")
(unspec:SVE_ALL
- [(match_operand:<VPRED> 1 "register_operand" "Upl")
- (match_operand:TI 2 "aarch64_sve_ld1r_operand" "Uty")]
+ [(match_operand:<VPRED> 2 "register_operand" "Upl")
+ (match_operand:<V128> 1 "aarch64_sve_ld1rq_operand" "UtQ")]
UNSPEC_LD1RQ))]
"TARGET_SVE"
- "ld1rq<Vesize>\t%0.<Vetype>, %1/z, %2"
+ {
+ operands[1] = gen_rtx_MEM (<VEL>mode, XEXP (operands[1], 0));
+ return "ld1rq<Vesize>\t%0.<Vetype>, %2/z, %1";
+ }
)
;; -------------------------------------------------------------------------
emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
}
-/* Try to duplicate SRC into SVE register DEST, given that SRC is an
- integer of mode INT_MODE. Return true on success. */
+/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
+ register of mode MODE. Use TARGET for the result if it's nonnull
+ and convenient.
+
+ The two vector modes must have the same element mode. The behavior
+ is to duplicate architectural lane N of SRC into architectural lanes
+ N + I * STEP of the result. On big-endian targets, architectural
+ lane 0 of an Advanced SIMD vector is the last element of the vector
+ in memory layout, so for big-endian targets this operation has the
+ effect of reversing SRC before duplicating it. Callers need to
+ account for this. */
-static bool
-aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
- rtx src)
-{
- /* If the constant is smaller than 128 bits, we can do the move
- using a vector of SRC_MODEs. */
- if (src_mode != TImode)
- {
- poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
- GET_MODE_SIZE (src_mode));
- machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
- emit_move_insn (gen_lowpart (dup_mode, dest),
- gen_const_vec_duplicate (dup_mode, src));
- return true;
+rtx
+aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
+{
+ machine_mode src_mode = GET_MODE (src);
+ gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
+ insn_code icode = (BYTES_BIG_ENDIAN
+ ? code_for_aarch64_vec_duplicate_vq_be (mode)
+ : code_for_aarch64_vec_duplicate_vq_le (mode));
+
+ unsigned int i = 0;
+ expand_operand ops[3];
+ create_output_operand (&ops[i++], target, mode);
+ create_output_operand (&ops[i++], src, src_mode);
+ if (BYTES_BIG_ENDIAN)
+ {
+ /* Create a PARALLEL describing the reversal of SRC. */
+ unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
+ rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
+ nelts_per_vq - 1, -1);
+ create_fixed_operand (&ops[i++], sel);
}
+ expand_insn (icode, i, ops);
+ return ops[0].value;
+}
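+
+/* Note that the result above is ops[0].value rather than necessarily
+   TARGET; aarch64_expand_mov_immediate copes with this by emitting an
+   aarch64_sve_reinterpret when the register returned by
+   aarch64_expand_sve_const_vector differs from the destination.  */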
- /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
- src = force_const_mem (src_mode, src);
+/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
+ the memory image into DEST. Return true on success. */
+
+static bool
+aarch64_expand_sve_ld1rq (rtx dest, rtx src)
+{
+ src = force_const_mem (GET_MODE (src), src);
if (!src)
return false;
/* Make sure that the address is legitimate. */
- if (!aarch64_sve_ld1r_operand_p (src))
+ if (!aarch64_sve_ld1rq_operand_p (src))
{
rtx addr = force_reg (Pmode, XEXP (src, 0));
src = replace_equiv_address (src, addr);
unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
rtx ptrue = aarch64_ptrue_reg (pred_mode);
- src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
- emit_insn (gen_rtx_SET (dest, src));
+ emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
return true;
}
-/* Expand a move of general CONST_VECTOR SRC into DEST, given that it
- isn't a simple duplicate or series. */
+/* Return a register containing CONST_VECTOR SRC, given that SRC has an
+ SVE data mode and isn't a legitimate constant. Use TARGET for the
+ result if convenient.
-static void
-aarch64_expand_sve_const_vector (rtx dest, rtx src)
+ The returned register can have whatever mode seems most natural
+ given the contents of SRC. */
+
+static rtx
+aarch64_expand_sve_const_vector (rtx target, rtx src)
{
machine_mode mode = GET_MODE (src);
unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
- gcc_assert (npatterns > 1);
+ scalar_mode elt_mode = GET_MODE_INNER (mode);
+ unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
+ unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
+
+ if (nelts_per_pattern == 1 && encoded_bits == 128)
+ {
+ /* The constant is a duplicated quadword but can't be narrowed
+ beyond a quadword. Get the memory image of the first quadword
+ as a 128-bit vector and try using LD1RQ to load it from memory.
+
+ The effect for both endiannesses is to load memory lane N into
+ architectural lanes N + I * STEP of the result. On big-endian
+ targets, the layout of the 128-bit vector in an Advanced SIMD
+ register would be different from its layout in an SVE register,
+ but this 128-bit vector is a memory value only. */
+ machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
+ rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
+ if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
+ return target;
+ }
+
+ if (nelts_per_pattern == 1 && encoded_bits < 128)
+ {
+ /* The vector is a repeating sequence of 64 bits or fewer.
+	 See if we can load the repeated part using an Advanced SIMD move and then
+ duplicate it to fill a vector. This is better than using a GPR
+ move because it keeps everything in the same register file. */
+ machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
+ rtx_vector_builder builder (vq_mode, npatterns, 1);
+ for (unsigned int i = 0; i < npatterns; ++i)
+ {
+ /* We want memory lane N to go into architectural lane N,
+ so reverse for big-endian targets. The DUP .Q pattern
+ has a compensating reverse built-in. */
+ unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
+ builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
+ }
+ rtx vq_src = builder.build ();
+ if (aarch64_simd_valid_immediate (vq_src, NULL))
+ {
+ vq_src = force_reg (vq_mode, vq_src);
+ return aarch64_expand_sve_dupq (target, mode, vq_src);
+ }
- if (nelts_per_pattern == 1)
- {
-      /* The constant is a repeating sequence of at least two elements,
- where the repeating elements occupy no more than 128 bits.
- Get an integer representation of the replicated value. */
- scalar_int_mode int_mode;
- if (BYTES_BIG_ENDIAN)
- /* For now, always use LD1RQ to load the value on big-endian
- targets, since the handling of smaller integers includes a
- subreg that is semantically an element reverse. */
- int_mode = TImode;
- else
+ /* Get an integer representation of the repeating part of Advanced
+ SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
+ which for big-endian targets is lane-swapped wrt a normal
+ Advanced SIMD vector. This means that for both endiannesses,
+ memory lane N of SVE vector SRC corresponds to architectural
+ lane N of a register holding VQ_SRC. This in turn means that
+ memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
+ as a single 128-bit value) and thus that memory lane 0 of SRC is
+ in the lsb of the integer. Duplicating the integer therefore
+ ensures that memory lane N of SRC goes into architectural lane
+	 N + I * STEP of the SVE register.  */
+ scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
+ rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
+ if (elt_value)
{
- unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
- gcc_assert (int_bits <= 128);
- int_mode = int_mode_for_size (int_bits, 0).require ();
+ /* Pretend that we had a vector of INT_MODE to start with. */
+ elt_mode = int_mode;
+ mode = aarch64_full_sve_mode (int_mode).require ();
+
+ /* If the integer can be moved into a general register by a
+ single instruction, do that and duplicate the result. */
+ if (CONST_INT_P (elt_value)
+ && aarch64_move_imm (INTVAL (elt_value), elt_mode))
+ {
+ elt_value = force_reg (elt_mode, elt_value);
+ return expand_vector_broadcast (mode, elt_value);
+ }
}
- rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
- if (int_value
- && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
- return;
+ else if (npatterns == 1)
+ /* We're duplicating a single value, but can't do better than
+ force it to memory and load from there. This handles things
+ like symbolic constants. */
+ elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
+
+ if (elt_value)
+ {
+ /* Load the element from memory if we can, otherwise move it into
+ a register and use a DUP. */
+ rtx op = force_const_mem (elt_mode, elt_value);
+ if (!op)
+ op = force_reg (elt_mode, elt_value);
+ return expand_vector_broadcast (mode, op);
+ }
+ }
+
+ /* Try using INDEX. */
+ rtx base, step;
+ if (const_vec_series_p (src, &base, &step))
+ {
+ aarch64_expand_vec_series (target, base, step);
+ return target;
}
+ /* From here on, it's better to force the whole constant to memory
+ if we can. */
+ if (GET_MODE_NUNITS (mode).is_constant ())
+ return NULL_RTX;
+
/* Expand each pattern individually. */
+ gcc_assert (npatterns > 1);
rtx_vector_builder builder;
auto_vec<rtx, 16> vectors (npatterns);
for (unsigned int i = 0; i < npatterns; ++i)
npatterns /= 2;
for (unsigned int i = 0; i < npatterns; ++i)
{
- rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
+ rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
vectors[i] = tmp;
}
}
- gcc_assert (vectors[0] == dest);
+ gcc_assert (vectors[0] == target);
+ return target;
}
-/* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
- is a pattern that can be used to set DEST to a replicated scalar
- element. */
+/* Set DEST to immediate IMM. */
void
-aarch64_expand_mov_immediate (rtx dest, rtx imm,
- rtx (*gen_vec_duplicate) (rtx, rtx))
+aarch64_expand_mov_immediate (rtx dest, rtx imm)
{
machine_mode mode = GET_MODE (dest);
if (!CONST_INT_P (imm))
{
- rtx base, step, value;
if (GET_CODE (imm) == HIGH
|| aarch64_simd_valid_immediate (imm, NULL))
- emit_insn (gen_rtx_SET (dest, imm));
- else if (const_vec_series_p (imm, &base, &step))
- aarch64_expand_vec_series (dest, base, step);
- else if (const_vec_duplicate_p (imm, &value))
{
- /* If the constant is out of range of an SVE vector move,
- load it from memory if we can, otherwise move it into
- a register and use a DUP. */
- scalar_mode inner_mode = GET_MODE_INNER (mode);
- rtx op = force_const_mem (inner_mode, value);
- if (!op)
- op = force_reg (inner_mode, value);
- else if (!aarch64_sve_ld1r_operand_p (op))
- {
- rtx addr = force_reg (Pmode, XEXP (op, 0));
- op = replace_equiv_address (op, addr);
- }
- emit_insn (gen_vec_duplicate (dest, op));
- }
- else if (GET_CODE (imm) == CONST_VECTOR
- && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
- aarch64_expand_sve_const_vector (dest, imm);
- else
- {
- rtx mem = force_const_mem (mode, imm);
- gcc_assert (mem);
- emit_move_insn (dest, mem);
+ emit_insn (gen_rtx_SET (dest, imm));
+ return;
}
+ if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
+ if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
+ {
+ if (dest != res)
+ emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
+ return;
+ }
+
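+      /* Fall back to loading the constant from its memory image.  For SVE
+	 data modes this path is taken when aarch64_expand_sve_const_vector
+	 returns null, which it does only when the mode has a constant
+	 number of elements and the constant can therefore be forced to
+	 memory.  */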
+ rtx mem = force_const_mem (mode, imm);
+ gcc_assert (mem);
+ emit_move_insn (dest, mem);
return;
}
return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
}
+/* Return the full-width SVE vector mode for element mode MODE, if one
+ exists. */
+opt_machine_mode
+aarch64_full_sve_mode (scalar_mode mode)
+{
+ switch (mode)
+ {
+ case E_DFmode:
+ return VNx2DFmode;
+ case E_SFmode:
+ return VNx4SFmode;
+ case E_HFmode:
+ return VNx8HFmode;
+ case E_DImode:
+ return VNx2DImode;
+ case E_SImode:
+ return VNx4SImode;
+ case E_HImode:
+ return VNx8HImode;
+ case E_QImode:
+ return VNx16QImode;
+ default:
+ return opt_machine_mode ();
+ }
+}
+
+/* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
+ if it exists. */
+opt_machine_mode
+aarch64_vq_mode (scalar_mode mode)
+{
+ switch (mode)
+ {
+ case E_DFmode:
+ return V2DFmode;
+ case E_SFmode:
+ return V4SFmode;
+ case E_HFmode:
+ return V8HFmode;
+ case E_SImode:
+ return V4SImode;
+ case E_HImode:
+ return V8HImode;
+ case E_QImode:
+ return V16QImode;
+ case E_DImode:
+ return V2DImode;
+ default:
+ return opt_machine_mode ();
+ }
+}
+
/* Return appropriate SIMD container
for MODE within a vector of WIDTH bits. */
static machine_mode
aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
{
if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
- switch (mode)
- {
- case E_DFmode:
- return VNx2DFmode;
- case E_SFmode:
- return VNx4SFmode;
- case E_HFmode:
- return VNx8HFmode;
- case E_DImode:
- return VNx2DImode;
- case E_SImode:
- return VNx4SImode;
- case E_HImode:
- return VNx8HImode;
- case E_QImode:
- return VNx16QImode;
- default:
- return word_mode;
- }
+ return aarch64_full_sve_mode (mode).else_mode (word_mode);
gcc_assert (known_eq (width, 64) || known_eq (width, 128));
if (TARGET_SIMD)
{
if (known_eq (width, 128))
- switch (mode)
- {
- case E_DFmode:
- return V2DFmode;
- case E_SFmode:
- return V4SFmode;
- case E_HFmode:
- return V8HFmode;
- case E_SImode:
- return V4SImode;
- case E_HImode:
- return V8HImode;
- case E_QImode:
- return V16QImode;
- case E_DImode:
- return V2DImode;
- default:
- break;
- }
+ return aarch64_vq_mode (mode).else_mode (word_mode);
else
switch (mode)
{
return true;
}
+/* Return a PARALLEL containing NELTS elements, with element I equal
+ to BASE + I * STEP. */
+
+rtx
+aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
+{
+ rtvec vec = rtvec_alloc (nelts);
+ for (unsigned int i = 0; i < nelts; ++i)
+ RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
+ return gen_rtx_PARALLEL (VOIDmode, vec);
+}
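+
+/* For example, aarch64_expand_sve_dupq calls this as
+   aarch64_gen_stepped_int_parallel (nelts_per_vq, nelts_per_vq - 1, -1),
+   so with two 64-bit lanes per quadword the result is
+   (parallel [(const_int 1) (const_int 0)]).  */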
+
+/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
+ series with step STEP. */
+
+bool
+aarch64_stepped_int_parallel_p (rtx op, int step)
+{
+ if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
+ return false;
+
+ unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
+ for (int i = 1; i < XVECLEN (op, 0); ++i)
+ if (!CONST_INT_P (XVECEXP (op, 0, i))
+ || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
+ return false;
+
+ return true;
+}
+
/* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
HIGH (exclusive). */
void
&& offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
}
+/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
+bool
+aarch64_sve_ld1rq_operand_p (rtx op)
+{
+ struct aarch64_address_info addr;
+ scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
+ if (!MEM_P (op)
+ || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
+ return false;
+
+ if (addr.type == ADDRESS_REG_IMM)
+ return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
+
+ if (addr.type == ADDRESS_REG_REG)
+ return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
+
+ return false;
+}
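+
+/* For example, for an LD1RQH the immediate form above accepts offsets
+   that are multiples of 16 bytes in the range [-128, 112] (a signed
+   4-bit offset scaled by the 16-byte size of TImode), and the register
+   form accepts an index shifted left by 1 to match the 2-byte element
+   size.  */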
+
/* Return true if OP is a valid MEM operand for an SVE LDR instruction.
The conditions for STR are the same. */
bool
UNSPEC_CLASTB
UNSPEC_FADDA
UNSPEC_REV_SUBREG
+ UNSPEC_REINTERPRET
UNSPEC_SPECULATION_TRACKER
UNSPEC_COPYSIGN
UNSPEC_TTEST ; Represent transaction test.
(match_test "aarch64_legitimate_address_p (V2DImode,
XEXP (op, 0), 1)")))
+(define_memory_constraint "UtQ"
+ "@internal
+ An address valid for SVE LD1RQs."
+ (and (match_code "mem")
+ (match_test "aarch64_sve_ld1rq_operand_p (op)")))
+
(define_memory_constraint "Uty"
"@internal
An address valid for SVE LD1Rs."
return aarch64_simd_check_vect_par_cnst_half (op, mode, false);
})
+(define_predicate "descending_int_parallel"
+ (match_code "parallel")
+{
+ return aarch64_stepped_int_parallel_p (op, -1);
+})
+
(define_special_predicate "aarch64_simd_lshift_imm"
(match_code "const,const_vector")
{
(and (match_operand 0 "memory_operand")
(match_test "aarch64_sve_ld1r_operand_p (op)")))
+(define_predicate "aarch64_sve_ld1rq_operand"
+ (and (match_code "mem")
+ (match_test "aarch64_sve_ld1rq_operand_p (op)")))
+
;; Like memory_operand, but restricted to addresses that are valid for
;; SVE LDR and STR instructions.
(define_predicate "aarch64_sve_ldr_operand"
ALWAYS_INLINE opt_mode (from_int m) : m_mode (machine_mode (m)) {}
machine_mode else_void () const;
- machine_mode else_blk () const;
+ machine_mode else_blk () const { return else_mode (BLKmode); }
+ machine_mode else_mode (machine_mode) const;
T require () const;
bool exists () const;
return m_mode;
}
-/* If the T exists, return its enum value, otherwise return E_BLKmode. */
+/* If the T exists, return its enum value, otherwise return FALLBACK. */
template<typename T>
inline machine_mode
-opt_mode<T>::else_blk () const
+opt_mode<T>::else_mode (machine_mode fallback) const
{
- return m_mode == E_VOIDmode ? E_BLKmode : m_mode;
+ return m_mode == E_VOIDmode ? fallback : m_mode;
}
/* Assert that the object contains a T and return it. */
+2019-08-13 Richard Sandiford <richard.sandiford@arm.com>
+
+ * gcc.target/aarch64/sve/init_2.c: Expect ld1rd to be used
+ instead of a full vector load.
+ * gcc.target/aarch64/sve/init_4.c: Likewise.
+ * gcc.target/aarch64/sve/ld1r_2.c: Remove constants that no longer
+ need to be loaded from memory.
+ * gcc.target/aarch64/sve/slp_2.c: Expect the same output for
+ big and little endian.
+ * gcc.target/aarch64/sve/slp_3.c: Likewise. Expect 3 of the
+ doubles to be moved via integer registers rather than loaded
+ from memory.
+ * gcc.target/aarch64/sve/slp_4.c: Likewise but for 4 doubles.
+ * gcc.target/aarch64/sve/spill_4.c: Expect 16-bit constants to be
+ loaded via an integer register rather than from memory.
+ * gcc.target/aarch64/sve/const_1.c: New test.
+ * gcc.target/aarch64/sve/const_2.c: Likewise.
+ * gcc.target/aarch64/sve/const_3.c: Likewise.
+
2019-08-13 Jozef Lawrynowicz <jozef.l@mittosystems.com>
* gcc.target/msp430/msp430.exp (msp430_device_permutations_runtest):
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+void
+set (uint64_t *dst, int count)
+{
+ for (int i = 0; i < count; ++i)
+ dst[i] = 0xffff00ff00ffff00ULL;
+}
+
+/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.2d, 0xffff00ff00ffff00\n.*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, CONST) \
+ void \
+ set_##TYPE (TYPE *dst, int count) \
+ { \
+ for (int i = 0; i < count; ++i) \
+ dst[i] = CONST; \
+ }
+
+TEST (uint16_t, 129)
+TEST (uint32_t, 129)
+TEST (uint64_t, 129)
+
+/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.8h, 0x81\n[^:]*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */
+/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.4s, 0x81\n[^:]*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */
+/* { dg-final { scan-assembler {\tmov\t(x[0-9]+), 129\n[^:]*\tmov\tz[0-9]+\.d, \1\n} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, CONST) \
+ void \
+ set_##TYPE (TYPE *dst, int count) \
+ { \
+ for (int i = 0; i < count; ++i) \
+ dst[i] = CONST; \
+ }
+
+TEST (uint16_t, 0x1234)
+TEST (uint32_t, 0x1234)
+TEST (uint64_t, 0x1234)
+
+/* { dg-final { scan-assembler {\tmov\t(w[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.h, \1\n} } } */
+/* { dg-final { scan-assembler {\tmov\t(w[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.s, \1\n} } } */
+/* { dg-final { scan-assembler {\tmov\t(x[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.d, \1\n} } } */
/*
** foo:
** ...
-** ld1w (z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]
-** insr \1, w1
-** insr \1, w0
+** ld1rd (z[0-9]+)\.d, p[0-9]+/z, \[x[0-9]+\]
+** insr \1\.s, w1
+** insr \1\.s, w0
** ...
*/
__attribute__((noipa))
/*
** foo:
** ...
-** ld1w (z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]
-** insr \1, w1
-** insr \1, w0
-** rev \1, \1
+** ld1rd (z[0-9]+)\.d, p[0-9]+/z, \[x[0-9]+\]
+** insr \1\.s, w1
+** insr \1\.s, w0
+** rev \1\.s, \1\.s
** ...
*/
__attribute__((noipa))
T (int64_t)
#define FOR_EACH_LOAD_BROADCAST_IMM(T) \
- T (int16_t, 129, imm_129) \
- T (int32_t, 129, imm_129) \
- T (int64_t, 129, imm_129) \
- \
- T (int16_t, -130, imm_m130) \
- T (int32_t, -130, imm_m130) \
- T (int64_t, -130, imm_m130) \
- \
- T (int16_t, 0x1234, imm_0x1234) \
- T (int32_t, 0x1234, imm_0x1234) \
- T (int64_t, 0x1234, imm_0x1234) \
- \
- T (int16_t, 0xFEDC, imm_0xFEDC) \
- T (int32_t, 0xFEDC, imm_0xFEDC) \
- T (int64_t, 0xFEDC, imm_0xFEDC) \
- \
T (int32_t, 0x12345678, imm_0x12345678) \
T (int64_t, 0x12345678, imm_0x12345678) \
\
FOR_EACH_LOAD_BROADCAST_IMM (DEF_LOAD_BROADCAST_IMM)
/* { dg-final { scan-assembler-times {\tld1rb\tz[0-9]+\.b, p[0-7]/z, } 1 } } */
-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, p[0-7]/z, } 5 } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, p[0-7]/z, } 7 } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, p[0-7]/z, } 8 } } */
+/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, p[0-7]/z, } 1 } } */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, p[0-7]/z, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, p[0-7]/z, } 4 } } */
TEST_ALL (VEC_PERM)
-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, } 2 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 3 } } */
/* { dg-final { scan-assembler-not {\tzip1\t} } } */
/* { dg-final { scan-assembler-not {\tzip2\t} } } */
TEST_ALL (VEC_PERM)
/* 1 for each 8-bit type. */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */
-/* 1 for each 16-bit type and 4 for double. */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 7 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 } } */
+/* 1 for each 16-bit type plus 1 for double. */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 } } */
/* 1 for each 32-bit type. */
/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #62\n} 2 } } */
+/* 3 for double. */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 3 } } */
/* The 64-bit types need:
ZIP1 ZIP1 (2 ZIP2s optimized away)
TEST_ALL (VEC_PERM)
-/* 1 for each 8-bit type, 4 for each 32-bit type and 8 for double. */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 22 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 20 { target aarch64_big_endian } } } */
+/* 1 for each 8-bit type, 4 for each 32-bit type and 4 for double. */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 18 } } */
/* 1 for each 16-bit type. */
/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]\.h, } 3 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #37\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #24\n} 2 } } */
/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #81\n} 2 } } */
+/* 4 for double. */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 4 } } */
/* The 32-bit types need:
ZIP1 ZIP1 (2 ZIP2s optimized away)
TEST_LOOP (uint32_t, 0x12345);
TEST_LOOP (uint64_t, 0x123456);
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.h,} 3 } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.h,} } } */
/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s,} 3 } } */
/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d,} 3 } } */
-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h,} 3 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 3 } } */
/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s,} 3 } } */
/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d,} 3 } } */
/* { dg-final { scan-assembler-not {\tldr\tz[0-9]} } } */