aarch64: Support permutes on unpacked SVE vectors

author Richard Sandiford <richard.sandiford@arm.com>
Fri, 6 Nov 2020 16:49:28 +0000 (16:49 +0000)
committer Richard Sandiford <richard.sandiford@arm.com>
Fri, 6 Nov 2020 16:49:28 +0000 (16:49 +0000)
diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def

index af972e8f72b0d5568afdc26e9d18dc2f747f107e..f304992e3edd47b5e451d2926766cf1298f55d23 100644 (file)
--- a/gcc/config/aarch64/aarch64-modes.def
+++ b/gcc/config/aarch64/aarch64-modes.def
@@ -136,11 +136,13 @@ ADJUST_NUNITS (VNx2QI, aarch64_sve_vg);
  ADJUST_NUNITS (VNx2HI, aarch64_sve_vg);
  ADJUST_NUNITS (VNx2SI, aarch64_sve_vg);
  ADJUST_NUNITS (VNx2HF, aarch64_sve_vg);
+ADJUST_NUNITS (VNx2BF, aarch64_sve_vg);
  ADJUST_NUNITS (VNx2SF, aarch64_sve_vg);
  
  ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2);
  ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2);
  ADJUST_NUNITS (VNx4HF, aarch64_sve_vg * 2);
+ADJUST_NUNITS (VNx4BF, aarch64_sve_vg * 2);
  
  ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4);
  
@@ -151,7 +153,9 @@ ADJUST_ALIGNMENT (VNx8QI, 1);
  ADJUST_ALIGNMENT (VNx2HI, 2);
  ADJUST_ALIGNMENT (VNx4HI, 2);
  ADJUST_ALIGNMENT (VNx2HF, 2);
+ADJUST_ALIGNMENT (VNx2BF, 2);
  ADJUST_ALIGNMENT (VNx4HF, 2);
+ADJUST_ALIGNMENT (VNx4BF, 2);
  
  ADJUST_ALIGNMENT (VNx2SI, 4);
  ADJUST_ALIGNMENT (VNx2SF, 4);
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md

index 31a8c5a5aefc24b36c5115157cde0482b7a7927b..4b0a1ebe9e1dd8bcbf683c5c136d9458b61dd943 100644 (file)
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3009,6 +3009,22 @@
    "<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>"
  )
  
+;; Another way of expressing the REVB, REVH and REVW patterns, with this
+;; form being easier for permutes.  The predicate mode determines the number
+;; of lanes and the data mode decides the granularity of the reversal within
+;; each lane.
+;;
+;; The insn condition only accepts combinations in which the predicate
+;; mode's elem_bits is greater than the data mode's container_bits, i.e.
+;; each lane spans more than one data container.  In the output template
+;; the mnemonic suffix is taken from the data mode's Vcwtype, while the
+;; register suffixes are taken from the predicate mode's Vetype.
+(define_insn "@aarch64_sve_revbhw_<SVE_ALL:mode><PRED_HSD:mode>"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+       (unspec:SVE_ALL
+         [(match_operand:PRED_HSD 1 "register_operand" "Upl")
+          (unspec:SVE_ALL
+            [(match_operand:SVE_ALL 2 "register_operand" "w")]
+            UNSPEC_REVBHW)]
+         UNSPEC_PRED_X))]
+  "TARGET_SVE && <PRED_HSD:elem_bits> > <SVE_ALL:container_bits>"
+  "rev<SVE_ALL:Vcwtype>\t%0.<PRED_HSD:Vetype>, %1/m, %2.<PRED_HSD:Vetype>"
+)
+
  ;; Predicated integer unary operations with merging.
  (define_insn "@cond_<optab><mode>"
    [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w, ?&w")
@@ -8273,14 +8289,14 @@
  
  ;; Duplicate one element of a vector.
  (define_insn "@aarch64_sve_dup_lane<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-       (vec_duplicate:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+       (vec_duplicate:SVE_ALL
           (vec_select:<VEL>
-           (match_operand:SVE_FULL 1 "register_operand" "w")
+           (match_operand:SVE_ALL 1 "register_operand" "w")
             (parallel [(match_operand:SI 2 "const_int_operand")]))))]
    "TARGET_SVE
-   && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (<VEL>mode), 0, 63)"
-  "dup\t%0.<Vetype>, %1.<Vetype>[%2]"
+   && IN_RANGE (INTVAL (operands[2]) * <container_bits> / 8, 0, 63)"
+  "dup\t%0.<Vctype>, %1.<Vctype>[%2]"
  )
  
  ;; Use DUP.Q to duplicate a 128-bit segment of a register.
@@ -8321,17 +8337,18 @@
  
  ;; Reverse the order of elements within a full vector.
  (define_insn "@aarch64_sve_rev<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-       (unspec:SVE_FULL
-         [(match_operand:SVE_FULL 1 "register_operand" "w")]
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+       (unspec:SVE_ALL
+         [(match_operand:SVE_ALL 1 "register_operand" "w")]
           UNSPEC_REV))]
    "TARGET_SVE"
-  "rev\t%0.<Vetype>, %1.<Vetype>")
+  "rev\t%0.<Vctype>, %1.<Vctype>")
  
  ;; -------------------------------------------------------------------------
  ;; ---- [INT,FP] Special-purpose binary permutes
  ;; -------------------------------------------------------------------------
  ;; Includes:
+;; - EXT
  ;; - SPLICE
  ;; - TRN1
  ;; - TRN2
@@ -8359,13 +8376,13 @@
  ;; Permutes that take half the elements from one vector and half the
  ;; elements from the other.
  (define_insn "@aarch64_sve_<perm_insn><mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-       (unspec:SVE_FULL
-         [(match_operand:SVE_FULL 1 "register_operand" "w")
-          (match_operand:SVE_FULL 2 "register_operand" "w")]
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+       (unspec:SVE_ALL
+         [(match_operand:SVE_ALL 1 "register_operand" "w")
+          (match_operand:SVE_ALL 2 "register_operand" "w")]
           PERMUTE))]
    "TARGET_SVE"
-  "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
+  "<perm_insn>\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>"
  )
  
  ;; Apply PERMUTE to 128-bit sequences.  The behavior of these patterns
@@ -8383,16 +8400,16 @@
  ;; Concatenate two vectors and extract a subvector.  Note that the
  ;; immediate (third) operand is the lane index not the byte index.
  (define_insn "@aarch64_sve_ext<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w")
-       (unspec:SVE_FULL
-         [(match_operand:SVE_FULL 1 "register_operand" "0, w")
-          (match_operand:SVE_FULL 2 "register_operand" "w, w")
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w, ?&w")
+       (unspec:SVE_ALL
+         [(match_operand:SVE_ALL 1 "register_operand" "0, w")
+          (match_operand:SVE_ALL 2 "register_operand" "w, w")
            (match_operand:SI 3 "const_int_operand")]
           UNSPEC_EXT))]
    "TARGET_SVE
-   && IN_RANGE (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode), 0, 255)"
+   && IN_RANGE (INTVAL (operands[3]) * <container_bits> / 8, 0, 255)"
    {
-    operands[3] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode));
+    operands[3] = GEN_INT (INTVAL (operands[3]) * <container_bits> / 8);
      return (which_alternative == 0
             ? "ext\\t%0.b, %0.b, %2.b, #%3"
             : "movprfx\t%0, %1\;ext\\t%0.b, %0.b, %2.b, #%3");
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index 0ae6c8b53f6e7ae629bafc2ec033a440012cbe42..97cb68980e975dfb2c0c0c0a05f9153beb64a2ad 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2226,6 +2226,9 @@ aarch64_classify_vector_mode (machine_mode mode)
      /* Partial SVE HF vectors.  */
      case E_VNx2HFmode:
      case E_VNx4HFmode:
+    /* Partial SVE BF vectors.  */
+    case E_VNx2BFmode:
+    case E_VNx4BFmode:
      /* Partial SVE SF vector.  */
      case E_VNx2SFmode:
        return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
@@ -20468,18 +20471,21 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
        || !diff)
      return false;
  
-  size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
-  if (size == 8)
+  if (d->vec_flags & VEC_SVE_DATA)
+    size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
+  else
+    size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
+  if (size == 64)
      {
        unspec = UNSPEC_REV64;
        pred_mode = VNx2BImode;
      }
-  else if (size == 4)
+  else if (size == 32)
      {
        unspec = UNSPEC_REV32;
        pred_mode = VNx4BImode;
      }
-  else if (size == 2)
+  else if (size == 16)
      {
        unspec = UNSPEC_REV16;
        pred_mode = VNx8BImode;
@@ -20496,28 +20502,11 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
    if (d->testing_p)
      return true;
  
-  if (d->vec_flags == VEC_SVE_DATA)
-    {
-      machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
-      rtx target = gen_reg_rtx (int_mode);
-      if (BYTES_BIG_ENDIAN)
-       /* The act of taking a subreg between INT_MODE and d->vmode
-          is itself a reversing operation on big-endian targets;
-          see the comment at the head of aarch64-sve.md for details.
-          First reinterpret OP0 as INT_MODE without using a subreg
-          and without changing the contents.  */
-       emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
-      else
-       {
-         /* For SVE we use REV[BHW] unspecs derived from the element size
-            of v->mode and vector modes whose elements have SIZE bytes.
-            This ensures that the vector modes match the predicate modes.  */
-         int unspec = aarch64_sve_rev_unspec (d->vmode);
-         rtx pred = aarch64_ptrue_reg (pred_mode);
-         emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
-                                      gen_lowpart (int_mode, d->op0)));
-       }
-      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+  if (d->vec_flags & VEC_SVE_DATA)
+    {
+      rtx pred = aarch64_ptrue_reg (pred_mode);
+      emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
+                                        d->target, pred, d->op0));
        return true;
      }
    rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
@@ -20562,7 +20551,8 @@ aarch64_evpc_dup (struct expand_vec_perm_d *d)
        || !d->perm[0].is_constant (&elt))
      return false;
  
-  if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
+  if ((d->vec_flags & VEC_SVE_DATA)
+      && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
      return false;
  
    /* Success! */
@@ -20782,6 +20772,7 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
  
    if ((d->vec_flags == VEC_ADVSIMD
         || d->vec_flags == VEC_SVE_DATA
+       || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
         || d->vec_flags == VEC_SVE_PRED)
        && known_gt (nelt, 1))
      {
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md

index 054fd8515c6ebf136da699e2993f6ebb348c3b1a..fb1426b7752890848cb49722ef7442d96cb1408b 100644 (file)
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -400,7 +400,7 @@
  (define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI
                                VNx8HI VNx4HI VNx2HI
                                VNx8HF VNx4HF VNx2HF
-                              VNx8BF
+                              VNx8BF VNx4BF VNx2BF
                                VNx4SI VNx2SI
                                VNx4SF VNx2SF
                                VNx2DI
@@ -418,11 +418,13 @@
                                 VNx2DI])
  
  ;; SVE modes with 2 or 4 elements.
-(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF
-                             VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2BF VNx2SI VNx2SF
+                             VNx2DI VNx2DF
+                             VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF])
  
  ;; SVE modes with 2 elements.
-(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF])
+(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2BF
+                            VNx2SI VNx2SF VNx2DI VNx2DF])
  
  ;; SVE integer modes with 2 elements, excluding the widest element.
  (define_mode_iterator SVE_2BHSI [VNx2QI VNx2HI VNx2SI])
@@ -431,7 +433,7 @@
  (define_mode_iterator SVE_2HSDI [VNx2HI VNx2SI VNx2DI])
  
  ;; SVE modes with 4 elements.
-(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF])
  
  ;; SVE integer modes with 4 elements, excluding the widest element.
  (define_mode_iterator SVE_4BHI [VNx4QI VNx4HI])
@@ -621,6 +623,7 @@
      UNSPEC_REVB                ; Used in aarch64-sve.md.
      UNSPEC_REVH                ; Used in aarch64-sve.md.
      UNSPEC_REVW                ; Used in aarch64-sve.md.
+    UNSPEC_REVBHW      ; Used in aarch64-sve.md.
      UNSPEC_SMUL_HIGHPART ; Used in aarch64-sve.md.
      UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md.
      UNSPEC_FMLA                ; Used in aarch64-sve.md.
@@ -968,6 +971,16 @@
                              (VNx4SI "32") (VNx2DI "64")
                              (VNx8HF "16") (VNx4SF "32") (VNx2DF "64")])
  
+;; The number of bits in a vector container.
+;; For modes with fewer elements than a same-element-size full vector,
+;; this is the width of the slot that holds each element rather than the
+;; width of the element itself; e.g. VNx4HI maps to 32 and VNx2QI maps
+;; to 64, whereas VNx8HI maps to 16.
+(define_mode_attr container_bits [(VNx16QI "8")
+                                 (VNx8HI "16") (VNx8QI "16") (VNx8HF "16")
+                                 (VNx8BF "16")
+                                 (VNx4SI "32") (VNx4HI "32") (VNx4QI "32")
+                                 (VNx4SF "32") (VNx4HF "32") (VNx4BF "32")
+                                 (VNx2DI "64") (VNx2SI "64") (VNx2HI "64")
+                                 (VNx2QI "64") (VNx2DF "64") (VNx2SF "64")
+                                 (VNx2HF "64") (VNx2BF "64")])
+
  ;; Attribute to describe constants acceptable in logical operations
  (define_mode_attr lconst [(SI "K") (DI "L")])
  
@@ -1029,7 +1042,7 @@
                           (VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
                           (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
                           (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
-                         (VNx8BF "h")
+                         (VNx8BF "h") (VNx4BF "h") (VNx2BF "h")
                           (VNx4SI "s") (VNx2SI "s")
                           (VNx4SF "s") (VNx2SF "s")
                           (VNx2DI "d")
@@ -1047,7 +1060,7 @@
  (define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
                           (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
                           (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
-                         (VNx8BF "h")
+                         (VNx8BF "h") (VNx4BF "h") (VNx2BF "h")
                           (VNx4SI "w") (VNx2SI "w")
                           (VNx4SF "w") (VNx2SF "w")
                           (VNx2DI "d")
@@ -1066,12 +1079,23 @@
  (define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d")
                           (VNx8HI "h") (VNx4HI "s") (VNx2HI "d")
                           (VNx8HF "h") (VNx4HF "s") (VNx2HF "d")
-                         (VNx8BF "h")
+                         (VNx8BF "h") (VNx4BF "s") (VNx2BF "d")
                           (VNx4SI "s") (VNx2SI "d")
                           (VNx4SF "s") (VNx2SF "d")
                           (VNx2DI "d")
                           (VNx2DF "d")])
  
+;; The instruction mnemonic suffix for an SVE mode's element container,
+;; i.e. the Vewtype of full SVE modes that have the same number of elements.
+;; The table matches Vctype except that 32-bit containers map to "w"
+;; instead of "s".
+(define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d")
+                          (VNx8HI "h") (VNx4HI "w") (VNx2HI "d")
+                          (VNx8HF "h") (VNx4HF "w") (VNx2HF "d")
+                          (VNx8BF "h") (VNx4BF "w") (VNx2BF "d")
+                          (VNx4SI "w") (VNx2SI "d")
+                          (VNx4SF "w") (VNx2SF "d")
+                          (VNx2DI "d")
+                          (VNx2DF "d")])
+
  ;; Vetype is used everywhere in scheduling type and assembly output,
  ;; sometimes they are not the same, for example HF modes on some
  ;; instructions.  stype is defined to represent scheduling type
@@ -1107,7 +1131,7 @@
                        (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
                        (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
                        (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
-                      (VNx8BF "BF")
+                      (VNx8BF "BF") (VNx4BF "BF") (VNx2BF "BF")
                        (VNx4SI "SI") (VNx2SI "SI")
                        (VNx4SF "SF") (VNx2SF "SF")
                        (VNx2DI "DI")
@@ -1127,7 +1151,7 @@
                        (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
                        (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
                        (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
-                      (VNx8BF "bf")
+                      (VNx8BF "bf") (VNx4BF "bf") (VNx2BF "bf")
                        (VNx4SI "si") (VNx2SI "si")
                        (VNx4SF "sf") (VNx2SF "sf")
                        (VNx2DI "di")
@@ -1310,7 +1334,7 @@
                           (VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w")
                           (VNx8HI "w") (VNx4HI "w") (VNx2HI "w")
                           (VNx8HF "w") (VNx4HF "w") (VNx2HF "w")
-                         (VNx8BF "w")
+                         (VNx8BF "w") (VNx4BF "w") (VNx2BF "w")
                           (VNx4SI "w") (VNx2SI "w")
                           (VNx4SF "w") (VNx2SF "w")
                           (VNx2DI "x")
@@ -1380,6 +1404,8 @@
                                    (VNx2DI "VNx2DI")
                                    (VNx8HF "VNx8HI") (VNx4HF "VNx4SI")
                                    (VNx2HF "VNx2DI")
+                                  (VNx8BF "VNx8HI") (VNx4BF "VNx4SI")
+                                  (VNx2BF "VNx2DI")
                                    (VNx4SF "VNx4SI") (VNx2SF "VNx2DI")
                                    (VNx2DF "VNx2DI")])
  
@@ -1392,6 +1418,8 @@
                                    (VNx2DI "vnx2di")
                                    (VNx8HF "vnx8hi") (VNx4HF "vnx4si")
                                    (VNx2HF "vnx2di")
+                                  (VNx8BF "vnx8hi") (VNx4BF "vnx4si")
+                                  (VNx2BF "vnx2di")
                                    (VNx4SF "vnx4si") (VNx2SF "vnx2di")
                                    (VNx2DF "vnx2di")])
  
@@ -1617,7 +1645,7 @@
                          (VNx4QI "VNx4BI") (VNx2QI "VNx2BI")
                          (VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI")
                          (VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI")
-                        (VNx8BF "VNx8BI")
+                        (VNx8BF "VNx8BI") (VNx4BF "VNx4BI") (VNx2BF "VNx2BI")
                          (VNx4SI "VNx4BI") (VNx2SI "VNx2BI")
                          (VNx4SF "VNx4BI") (VNx2SF "VNx2BI")
                          (VNx2DI "VNx2BI")
@@ -1643,7 +1671,7 @@
                          (VNx4QI "vnx4bi") (VNx2QI "vnx2bi")
                          (VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi")
                          (VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi")
-                        (VNx8BF "vnx8bi")
+                        (VNx8BF "vnx8bi") (VNx4BF "vnx4bi") (VNx2BF "vnx2bi")
                          (VNx4SI "vnx4bi") (VNx2SI "vnx2bi")
                          (VNx4SF "vnx4bi") (VNx2SF "vnx2bi")
                          (VNx2DI "vnx2bi")
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c

new file mode 100644 (file)

index 0000000..3d74ff9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c
@@ -0,0 +1,331 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+/* Fixed-length vector types of 128, 64 and 32 bytes.  The test is compiled
+   with -msve-vector-bits=2048 (256-byte vectors), so every type here is
+   smaller than a full SVE vector.  */
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+/* PERMn (B) expands to 2^(n+1) copies of B, giving a selector that
+   broadcasts element B: PERM4 has 32 entries, PERM5 has 64 and PERM6
+   has 128.  */
+#define PERM0(B) B, B
+#define PERM1(B) PERM0 (B), PERM0 (B)
+#define PERM2(B) PERM1 (B), PERM1 (B)
+#define PERM3(B) PERM2 (B), PERM2 (B)
+#define PERM4(B) PERM3 (B), PERM3 (B)
+#define PERM5(B) PERM4 (B), PERM4 (B)
+#define PERM6(B) PERM5 (B), PERM5 (B)
+
+/* Broadcast element 1 of a 128-element byte vector.  The expected code
+   loads the bytes into .h containers and uses DUP with lane index 1.  */
+/*
+** qi_dup_h_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.h, \2\.h\[1\]
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_dup_h_1 (v128qi x)
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
+}
+
+/* Broadcast element 31 of a 128-element byte vector held in .h containers.
+   31 * 2 bytes = 62, so this is the highest lane whose byte offset still
+   fits the 0..63 range required for DUP.  */
+/*
+** qi_dup_h_31:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.h, \2\.h\[31\]
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_dup_h_31 (v128qi x)
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (31) });
+}
+
+/* Broadcast element 1 of a 64-element byte vector.  The expected code
+   loads the bytes into .s containers and uses DUP with lane index 1.  */
+/*
+** qi_dup_s_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.s, \2\.s\[1\]
+**     st1b    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_dup_s_1 (v64qi x)
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
+}
+
+/* Broadcast element 15 of a 64-element byte vector held in .s containers.
+   15 * 4 bytes = 60, the highest lane whose byte offset fits the 0..63
+   range required for DUP.  */
+/*
+** qi_dup_s_15:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.s, \2\.s\[15\]
+**     st1b    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_dup_s_15 (v64qi x)
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (15) });
+}
+
+/* Broadcast element 1 of a 32-element byte vector.  The expected code
+   loads the bytes into .d containers and uses DUP with lane index 1.  */
+/*
+** qi_dup_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[1\]
+**     st1b    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32qi
+qi_dup_d_1 (v32qi x)
+{
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
+}
+
+/* Broadcast element 7 of a 32-element byte vector held in .d containers.
+   7 * 8 bytes = 56, the highest lane whose byte offset fits the 0..63
+   range required for DUP.  */
+/*
+** qi_dup_d_7:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[7\]
+**     st1b    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32qi
+qi_dup_d_7 (v32qi x)
+{
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (7) });
+}
+
+/* Broadcast element 1 of a 64-element halfword vector.  The expected code
+   loads the halfwords into .s containers and uses DUP with lane index 1.  */
+/*
+** hi_dup_s_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.s, \2\.s\[1\]
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_dup_s_1 (v64hi x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/* Broadcast element 15 of a 64-element halfword vector held in .s
+   containers.  15 * 4 bytes = 60, the highest lane whose byte offset fits
+   the 0..63 range required for DUP.  */
+/*
+** hi_dup_s_15:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.s, \2\.s\[15\]
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_dup_s_15 (v64hi x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (15) });
+}
+
+/* Broadcast element 1 of a 64-element _Float16 vector, using a v64hi
+   selector.  The expected code uses .s containers and DUP with lane
+   index 1.  */
+/*
+** hf_dup_s_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.s, \2\.s\[1\]
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_dup_s_1 (v64hf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/* Broadcast element 11 of a 64-element _Float16 vector held in .s
+   containers; 11 * 4 bytes = 44, inside the 0..63 range required for
+   DUP.  */
+/*
+** hf_dup_s_11:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.s, \2\.s\[11\]
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_dup_s_11 (v64hf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (11) });
+}
+
+/* Broadcast element 1 of a 64-element __bf16 vector, using a v64hi
+   selector.  The expected code uses .s containers and DUP with lane
+   index 1.  */
+/*
+** bf_dup_s_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.s, \2\.s\[1\]
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_dup_s_1 (v64bf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/* Broadcast element 13 of a 64-element __bf16 vector held in .s
+   containers; 13 * 4 bytes = 52, inside the 0..63 range required for
+   DUP.  */
+/*
+** bf_dup_s_13:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.s, \2\.s\[13\]
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_dup_s_13 (v64bf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (13) });
+}
+
+/* Broadcast element 1 of a 32-element halfword vector.  The expected code
+   loads the halfwords into .d containers and uses DUP with lane index 1.  */
+/*
+** hi_dup_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[1\]
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hi
+hi_dup_d_1 (v32hi x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/* Broadcast element 7 of a 32-element halfword vector held in .d
+   containers.  7 * 8 bytes = 56, the highest lane whose byte offset fits
+   the 0..63 range required for DUP.  */
+/*
+** hi_dup_d_7:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[7\]
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hi
+hi_dup_d_7 (v32hi x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) });
+}
+
+/* Broadcast element 1 of a 32-element _Float16 vector, using a v32hi
+   selector.  The expected code uses .d containers and DUP with lane
+   index 1.  */
+/*
+** hf_dup_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[1\]
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hf
+hf_dup_d_1 (v32hf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/* Broadcast element 5 of a 32-element _Float16 vector held in .d
+   containers; 5 * 8 bytes = 40, inside the 0..63 range required for
+   DUP.  */
+/*
+** hf_dup_d_5:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[5\]
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hf
+hf_dup_d_5 (v32hf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (5) });
+}
+
+/* Broadcast element 1 of a 32-element __bf16 vector, using a v32hi
+   selector.  The expected code uses .d containers and DUP with lane
+   index 1.  */
+/*
+** bf_dup_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[1\]
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32bf
+bf_dup_d_1 (v32bf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/* Broadcast element 6 of a 32-element __bf16 vector held in .d
+   containers; 6 * 8 bytes = 48, inside the 0..63 range required for
+   DUP.  */
+/*
+** bf_dup_d_6:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[6\]
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32bf
+bf_dup_d_6 (v32bf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (6) });
+}
+
+/* Broadcast element 1 of a 32-element word vector.  The expected code
+   loads the words into .d containers and uses DUP with lane index 1.  */
+/*
+** si_dup_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[1\]
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32si
+si_dup_d_1 (v32si x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
+}
+
+/* Broadcast element 7 of a 32-element word vector held in .d containers.
+   7 * 8 bytes = 56, the highest lane whose byte offset fits the 0..63
+   range required for DUP.  */
+/*
+** si_dup_d_7:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[7\]
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32si
+si_dup_d_7 (v32si x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
+}
+
+/* Broadcast element 1 of a 32-element float vector, using a v32si
+   selector.  The expected code uses .d containers and DUP with lane
+   index 1.  */
+/*
+** sf_dup_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[1\]
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32sf
+sf_dup_d_1 (v32sf x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
+}
+
+/* Broadcast element 7 of a 32-element float vector held in .d containers.
+   7 * 8 bytes = 56, the highest lane whose byte offset fits the 0..63
+   range required for DUP.  */
+/*
+** sf_dup_d_7:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     dup     (z[0-9]+)\.d, \2\.d\[7\]
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32sf
+sf_dup_d_7 (v32sf x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c

new file mode 100644 (file)

index 0000000..50f73a1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c
@@ -0,0 +1,90 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+
+/* Fixed-length vector types of 128, 64 and 32 bytes.  The test is compiled
+   with -msve-vector-bits=2048 (256-byte vectors), so every type here is
+   smaller than a full SVE vector.  */
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+/* PERMn (B) expands to 2^(n+1) copies of B, giving a selector that
+   broadcasts element B: PERM4 has 32 entries, PERM5 has 64 and PERM6
+   has 128.  */
+#define PERM0(B) B, B
+#define PERM1(B) PERM0 (B), PERM0 (B)
+#define PERM2(B) PERM1 (B), PERM1 (B)
+#define PERM3(B) PERM2 (B), PERM2 (B)
+#define PERM4(B) PERM3 (B), PERM3 (B)
+#define PERM5(B) PERM4 (B), PERM4 (B)
+#define PERM6(B) PERM5 (B), PERM5 (B)
+
+/* Lane 32 with 2-byte (.h) containers is byte offset 64, one past the
+   0..63 range accepted for DUP, so no DUP should be generated.  */
+v128qi
+qi_dup_h_32 (v128qi x)
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (32) });
+}
+
+/* Lane 16 with 4-byte (.s) containers is byte offset 64, one past the
+   0..63 range accepted for DUP, so no DUP should be generated.  */
+v64qi
+qi_dup_s_16 (v64qi x)
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (16) });
+}
+
+/* Lane 8 with 8-byte (.d) containers is byte offset 64, one past the
+   0..63 range accepted for DUP, so no DUP should be generated.  */
+v32qi
+qi_dup_d_8 (v32qi x)
+{
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (8) });
+}
+
+/* Halfword elements in 4-byte (.s) containers: lane 16 is byte offset 64,
+   outside the 0..63 range accepted for DUP.  */
+v64hi
+hi_dup_s_16 (v64hi x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
+}
+
+/* _Float16 elements in 4-byte (.s) containers: lane 16 is byte offset 64,
+   outside the 0..63 range accepted for DUP.  */
+v64hf
+hf_dup_s_16 (v64hf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
+}
+
+/* __bf16 elements in 4-byte (.s) containers: lane 16 is byte offset 64,
+   outside the 0..63 range accepted for DUP.  */
+v64bf
+bf_dup_s_16 (v64bf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
+}
+
+/* Halfword elements in 8-byte (.d) containers: lane 8 is byte offset 64,
+   outside the 0..63 range accepted for DUP.  */
+v32hi
+hi_dup_d_8 (v32hi x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
+}
+
+/* _Float16 elements in 8-byte (.d) containers: lane 8 is byte offset 64,
+   outside the 0..63 range accepted for DUP.  */
+v32hf
+hf_dup_d_8 (v32hf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
+}
+
+/* __bf16 elements in 8-byte (.d) containers: lane 8 is byte offset 64,
+   outside the 0..63 range accepted for DUP.  */
+v32bf
+bf_dup_d_8 (v32bf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
+}
+
+/* Word elements in 8-byte (.d) containers: lane 8 is byte offset 64,
+   outside the 0..63 range accepted for DUP.  */
+v32si
+si_dup_d_8 (v32si x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (8) });
+}
+
+/* float elements in 8-byte (.d) containers: lane 8 is byte offset 64,
+   outside the 0..63 range accepted for DUP.  */
+v32sf
+sf_dup_d_8 (v32sf x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (8) });
+}
+
+/* { dg-final { scan-assembler-not {\tdup\tz} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ext_4.c b/gcc/testsuite/gcc.target/aarch64/sve/ext_4.c

new file mode 100644 (file)

index 0000000..4637b5c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/ext_4.c
@@ -0,0 +1,353 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B) B, B + 1  /* Two consecutive indices starting at B.  */
+#define PERM1(B) PERM0 (B), PERM0 (B + 2)
+#define PERM2(B) PERM1 (B), PERM1 (B + 4)
+#define PERM3(B) PERM2 (B), PERM2 (B + 8)
+#define PERM4(B) PERM3 (B), PERM3 (B + 16)
+#define PERM5(B) PERM4 (B), PERM4 (B + 32)
+#define PERM6(B) PERM5 (B), PERM5 (B + 64)  /* PERMn (B) is the 2^(n+1) ascending indices B, B + 1, ...  */
+
+/*
+** qi_ext_h_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #2
+**     st1b    \2\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_ext_h_1 (v128qi x)  /* Mask 1..128 (128 selects element 0 of the second x): rotate by one element, expected above as EXT #2.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
+}
+
+/*
+** qi_ext_h_1_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     ext     \3\.b, \3\.b, \2\.b, #2
+**     st1b    \3\.h, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     ext     \4\.b, \4\.b, \5\.b, #2
+**     st1b    \4\.h, \1, \[x8\]
+** )
+**     ret
+*/
+v128qi
+qi_ext_h_1_two_op (v128qi x, v128qi y)  /* Mask 1..128: elements 1..127 of x followed by y[0]; the pattern above accepts either load order.  */
+{
+  return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) });
+}
+
+/*
+** qi_ext_h_127:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #254
+**     st1b    \2\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_ext_h_127 (v128qi x)  /* Mask 127..254: x[127] followed by x[0..126], expected above as EXT #254.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) });
+}
+
+/*
+** qi_ext_s_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #4
+**     st1b    \2\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_ext_s_1 (v64qi x)  /* Mask 1..64: rotate by one element, expected above as EXT #4 on bytes loaded into .s containers.  */
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
+}
+
+/*
+** qi_ext_s_63:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #252
+**     st1b    \2\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_ext_s_63 (v64qi x)  /* Mask 63..126: x[63] followed by x[0..62], expected above as EXT #252.  */
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) });
+}
+
+/*
+** qi_ext_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #8
+**     st1b    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32qi
+qi_ext_d_1 (v32qi x)  /* Mask 1..32: rotate by one element, expected above as EXT #8 on bytes loaded into .d containers.  */
+{
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
+}
+
+/*
+** qi_ext_d_31:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #248
+**     st1b    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32qi
+qi_ext_d_31 (v32qi x)  /* Mask 31..62: x[31] followed by x[0..30], expected above as EXT #248.  */
+{
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) });
+}
+
+/*
+** hi_ext_s_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #4
+**     st1h    \2\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_ext_s_1 (v64hi x)  /* Mask 1..64: rotate by one element, expected above as EXT #4 on halfwords loaded into .s containers.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** hi_ext_s_63:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #252
+**     st1h    \2\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_ext_s_63 (v64hi x)  /* Mask 63..126: x[63] followed by x[0..62], expected above as EXT #252.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
+}
+
+/*
+** hf_ext_s_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #4
+**     st1h    \2\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_ext_s_1 (v64hf x)  /* _Float16 rotate by one element (integer v64hi selector), expected above as EXT #4.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** hf_ext_s_60:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #240
+**     st1h    \2\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_ext_s_60 (v64hf x)  /* Mask 60..123: x[60..63] followed by x[0..59], expected above as EXT #240.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (60) });
+}
+
+/*
+** bf_ext_s_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #4
+**     st1h    \2\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_ext_s_1 (v64bf x)  /* __bf16 rotate by one element (integer v64hi selector), expected above as EXT #4.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** bf_ext_s_40:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #160
+**     st1h    \2\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_ext_s_40 (v64bf x)  /* Mask 40..103: x[40..63] followed by x[0..39], expected above as EXT #160.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (40) });
+}
+
+/*
+** hi_ext_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #8
+**     st1h    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32hi
+hi_ext_d_1 (v32hi x)  /* Mask 1..32: rotate by one element, expected above as EXT #8 on halfwords loaded into .d containers.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** hi_ext_d_31:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #248
+**     st1h    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32hi
+hi_ext_d_31 (v32hi x)  /* Mask 31..62: x[31] followed by x[0..30], expected above as EXT #248.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
+}
+
+/*
+** hf_ext_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #8
+**     st1h    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32hf
+hf_ext_d_1 (v32hf x)  /* _Float16 rotate by one element (integer v32hi selector), expected above as EXT #8.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** hf_ext_d_18:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #144
+**     st1h    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32hf
+hf_ext_d_18 (v32hf x)  /* Mask 18..49: x[18..31] followed by x[0..17], expected above as EXT #144.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (18) });
+}
+
+/*
+** bf_ext_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #8
+**     st1h    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32bf
+bf_ext_d_1 (v32bf x)  /* __bf16 rotate by one element (integer v32hi selector), expected above as EXT #8.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** bf_ext_d_7:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #56
+**     st1h    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32bf
+bf_ext_d_7 (v32bf x)  /* Mask 7..38: x[7..31] followed by x[0..6], expected above as EXT #56.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) });
+}
+
+/*
+** si_ext_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #8
+**     st1w    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32si
+si_ext_d_1 (v32si x)  /* Mask 1..32: rotate by one element, expected above as EXT #8 on words loaded into .d containers.  */
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
+}
+
+/*
+** si_ext_d_31:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #248
+**     st1w    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32si
+si_ext_d_31 (v32si x)  /* Mask 31..62: x[31] followed by x[0..30], expected above as EXT #248.  */
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
+}
+
+/*
+** sf_ext_d_1:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #8
+**     st1w    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32sf
+sf_ext_d_1 (v32sf x)  /* float rotate by one element (integer v32si selector), expected above as EXT #8.  */
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
+}
+
+/*
+** sf_ext_d_31:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ext     \2\.b, \2\.b, \2\.b, #248
+**     st1w    \2\.d, \1, \[x8\]
+**     ret
+*/
+v32sf
+sf_ext_d_31 (v32sf x)  /* float variant: x[31] followed by x[0..30] (integer v32si selector), expected above as EXT #248.  */
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/rev_2.c b/gcc/testsuite/gcc.target/aarch64/sve/rev_2.c

new file mode 100644 (file)

index 0000000..417da37
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/rev_2.c
@@ -0,0 +1,177 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B) B, B - 1  /* Two consecutive indices counting down from B.  */
+#define PERM1(B) PERM0 (B), PERM0 (B - 2)
+#define PERM2(B) PERM1 (B), PERM1 (B - 4)
+#define PERM3(B) PERM2 (B), PERM2 (B - 8)
+#define PERM4(B) PERM3 (B), PERM3 (B - 16)
+#define PERM5(B) PERM4 (B), PERM4 (B - 32)
+#define PERM6(B) PERM5 (B), PERM5 (B - 64)  /* PERMn (B) is the 2^(n+1) descending indices B, B - 1, ...  */
+
+/*
+** qi_rev_h:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     rev     (z[0-9]+)\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_rev_h (v128qi x)  /* Mask 127 down to 0: full element reversal, expected above as REV on .h containers.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) });
+}
+
+/*
+** qi_rev_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     rev     (z[0-9]+)\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_rev_s (v64qi x)  /* Mask 63 down to 0: full element reversal, expected above as REV on .s containers.  */
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) });
+}
+
+/*
+** qi_rev_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     rev     (z[0-9]+)\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32qi
+qi_rev_d (v32qi x)  /* Mask 31 down to 0: full element reversal, expected above as REV on .d containers.  */
+{
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) });
+}
+
+/*
+** hi_rev_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     rev     (z[0-9]+)\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_rev_s (v64hi x)  /* Mask 63 down to 0: full element reversal, expected above as REV on .s containers.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
+}
+
+/*
+** hf_rev_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     rev     (z[0-9]+)\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_rev_s (v64hf x)  /* _Float16 full reversal (integer v64hi selector), expected above as REV on .s containers.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
+}
+
+/*
+** bf_rev_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     rev     (z[0-9]+)\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_rev_s (v64bf x)  /* __bf16 full reversal (integer v64hi selector), expected above as REV on .s containers.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
+}
+
+/*
+** hi_rev_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     rev     (z[0-9]+)\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hi
+hi_rev_d (v32hi x)  /* Mask 31 down to 0: full element reversal, expected above as REV on .d containers.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
+}
+
+/*
+** hf_rev_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     rev     (z[0-9]+)\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hf
+hf_rev_d (v32hf x)  /* _Float16 full reversal (integer v32hi selector), expected above as REV on .d containers.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
+}
+
+/*
+** bf_rev_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     rev     (z[0-9]+)\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32bf
+bf_rev_d (v32bf x)  /* __bf16 full reversal (integer v32hi selector), expected above as REV on .d containers.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
+}
+
+/*
+** si_rev_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     rev     (z[0-9]+)\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32si
+si_rev_d (v32si x)  /* Mask 31 down to 0: full element reversal, expected above as REV on .d containers.  */
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
+}
+
+/*
+** sf_rev_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     rev     (z[0-9]+)\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32sf
+sf_rev_d (v32sf x)  /* float full reversal (integer v32si selector), expected above as REV on .d containers.  */
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c b/gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c

new file mode 100644 (file)

index 0000000..62de812
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c
@@ -0,0 +1,127 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+
+#define PERM0(B) B + 1, B  /* Indices B and B + 1 in swapped order.  */
+#define PERM1(B) PERM0 (B), PERM0 (B + 2)
+#define PERM2(B) PERM1 (B), PERM1 (B + 4)
+#define PERM3(B) PERM2 (B), PERM2 (B + 8)
+#define PERM4(B) PERM3 (B), PERM3 (B + 16)
+#define PERM5(B) PERM4 (B), PERM4 (B + 32)
+#define PERM6(B) PERM5 (B), PERM5 (B + 64)  /* PERMn (B) covers 2^(n+1) indices from B with each adjacent pair swapped.  */
+
+/*
+** qi_revh_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     revh    (z[0-9]+)\.s, \1/m, \2\.s
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_revh_s (v128qi x)  /* Mask 1, 0, 3, 2, ...: swap each adjacent pair of elements, expected above as REVH on .s.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+/*
+** qi_revw_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     revw    (z[0-9]+)\.d, \1/m, \2\.d
+**     st1b    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_revw_d (v64qi x)  /* Mask 1, 0, 3, 2, ...: swap each adjacent pair of elements, expected above as REVW on .d.  */
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
+}
+
+/*
+** hi_revw_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     revw    (z[0-9]+)\.d, \1/m, \2\.d
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_revw_d (v64hi x)  /* Mask 1, 0, 3, 2, ...: swap each adjacent pair of elements, expected above as REVW on .d.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+/*
+** hf_revw_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     revw    (z[0-9]+)\.d, \1/m, \2\.d
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_revw_d (v64hf x)  /* _Float16 adjacent-pair swap (integer v64hi selector), expected above as REVW on .d.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+/*
+** bf_revw_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     revw    (z[0-9]+)\.d, \1/m, \2\.d
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_revw_d (v64bf x)  /* __bf16 adjacent-pair swap (integer v64hi selector), expected above as REVW on .d.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+#undef PERM1
+#define PERM1(B) PERM0 (B + 2), PERM0 (B)  /* Now B + 3, B + 2, B + 1, B: later masks reverse each group of four elements.  */
+
+/*
+** qi_revh_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     revh    (z[0-9]+)\.d, \1/m, \2\.d
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_revh_d (v128qi x)  /* Mask 3, 2, 1, 0, 7, 6, 5, 4, ...: reverse each group of four elements, expected above as REVH on .d.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+v64qi
+qi_revw_q (v64qi x)  /* Group-of-four reversal with no expected body; the file-level count of 6 rev? insns equals the annotated functions, so none may appear here.  */
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
+}
+
+v64hi
+hi_revw_q (v64hi x)  /* Group-of-four reversal with no expected body; the file-level count of 6 rev? insns equals the annotated functions, so none may appear here.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+#undef PERM2
+#define PERM2(B) PERM1 (B + 4), PERM1 (B)  /* B + 7 down to B: reverse each group of eight.  Built from PERM1 so that PERM6 still yields 128 indices.  */
+
+v128qi
+qi_revh_q (v128qi x)  /* Group-of-eight reversal with no expected body; the file-level count of 6 rev? insns means none may appear here.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c b/gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c

new file mode 100644 (file)

index 0000000..7634d01
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c
@@ -0,0 +1,127 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mbig-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+
+#define PERM0(B) B + 1, B  /* Indices B and B + 1 in swapped order.  */
+#define PERM1(B) PERM0 (B), PERM0 (B + 2)
+#define PERM2(B) PERM1 (B), PERM1 (B + 4)
+#define PERM3(B) PERM2 (B), PERM2 (B + 8)
+#define PERM4(B) PERM3 (B), PERM3 (B + 16)
+#define PERM5(B) PERM4 (B), PERM4 (B + 32)
+#define PERM6(B) PERM5 (B), PERM5 (B + 64)  /* PERMn (B) covers 2^(n+1) indices from B with each adjacent pair swapped.  */
+
+/*
+** qi_revh_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     revh    (z[0-9]+)\.s, \1/m, \2\.s
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_revh_s (v128qi x)  /* Big-endian run: swap each adjacent pair of elements (mask 1, 0, 3, 2, ...), expected above as REVH on .s.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+/*
+** qi_revw_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     revw    (z[0-9]+)\.d, \1/m, \2\.d
+**     st1b    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_revw_d (v64qi x)  /* Big-endian run: swap each adjacent pair of elements (mask 1, 0, 3, 2, ...), expected above as REVW on .d.  */
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
+}
+
+/*
+** hi_revw_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     revw    (z[0-9]+)\.d, \1/m, \2\.d
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_revw_d (v64hi x)  /* Big-endian run: swap each adjacent pair of elements (mask 1, 0, 3, 2, ...), expected above as REVW on .d.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+/*
+** hf_revw_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     revw    (z[0-9]+)\.d, \1/m, \2\.d
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_revw_d (v64hf x)  /* Big-endian run: _Float16 adjacent-pair swap (integer v64hi selector), expected above as REVW on .d.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+/*
+** bf_revw_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     revw    (z[0-9]+)\.d, \1/m, \2\.d
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_revw_d (v64bf x)  /* Big-endian run: __bf16 adjacent-pair swap (integer v64hi selector), expected above as REVW on .d.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+#undef PERM1
+#define PERM1(B) PERM0 (B + 2), PERM0 (B)  /* Now B + 3, B + 2, B + 1, B: later masks reverse each group of four elements.  */
+
+/*
+** qi_revh_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     revh    (z[0-9]+)\.d, \1/m, \2\.d
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_revh_d (v128qi x)  /* Big-endian run: reverse each group of four elements (mask 3, 2, 1, 0, ...), expected above as REVH on .d.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+v64qi
+qi_revw_q (v64qi x)  /* Group-of-four reversal with no expected body; the file-level count of 6 rev? insns equals the annotated functions, so none may appear here.  */
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
+}
+
+v64hi
+hi_revw_q (v64hi x)  /* Group-of-four reversal with no expected body; the file-level count of 6 rev? insns equals the annotated functions, so none may appear here.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+#undef PERM2
+#define PERM2(B) PERM1 (B + 4), PERM1 (B)  /* B + 7 down to B: reverse each group of eight.  Built from PERM1 so that PERM6 still yields 128 indices.  */
+
+v128qi
+qi_revh_q (v128qi x)  /* Group-of-eight reversal with no expected body; the file-level count of 6 rev? insns means none may appear here.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c

new file mode 100644 (file)

index 0000000..fe25000
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+void
+f (short *restrict s, signed char *restrict c)  /* Widen c[0], c[2], c[4], c[6] to short and store each one twice, to s[i] and s[i + 1].  */
+{
+  for (int i = 0; i < 8; i += 2)
+    {
+      s[i] = c[i];  /* Implicit signed char -> short conversion.  */
+      s[i + 1] = c[i];  /* Same source element again; the dg-final patterns below look for TRN1 on .h elements.  */
+    }
+}
+
+/* Ideally this would use LD1SB, but currently we use LD1B and
+   sign-extend it after the permute.  */
+/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl6\n} } } */
+/* { dg-final { scan-assembler {\tld1s?b\tz[0-9]+\.h} } } */
+/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.h,} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c b/gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c

new file mode 100644 (file)

index 0000000..df059dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c
@@ -0,0 +1,403 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B, C) B, B + C  /* One index pair: B, then B offset by C.  */
+#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C)
+#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C)
+#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C)
+#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C)
+#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C)
+#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C)  /* PERMn (B, C) is the 2^n pairs (B + 2k, B + 2k + C), k = 0, 1, ...  */
+
+/*
+** qi_trn1_h_a:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_trn1_h_a (v128qi x)  /* Mask 0, 0, 2, 2, ...: each even-indexed element fills its own slot and the next; expected above as TRN1 on .h.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) });
+}
+
+/*
+** qi_trn1_h_b:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_trn1_h_b (v128qi x)  /* Mask 0, 128, 2, 130, ...: odd slots name the second operand, which is also x, so the selection matches qi_trn1_h_a.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) });
+}
+
+/*
+** qi_trn1_h_c:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_trn1_h_c (v128qi x)  /* Mask 128, 128, 130, 130, ...: every index names the second operand, which is also x; same TRN1 expected above.  */
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) });
+}
+
+/*
+** qi_trn1_h_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     trn1    \3\.h, \3\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     trn1    \4\.h, \4\.h, \5\.h
+**     st1b    \4\.h, \1, \[x8\]
+** )
+**     ret
+*/
+v128qi
+qi_trn1_h_two_op (v128qi x, v128qi y)  /* Result is x[0], y[0], x[2], y[2], ...; the pattern above accepts either load order.  */
+{
+  return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) });
+}
+
+/*
+** qi_trn1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_trn1_s (v64qi x)  /* Mask 0, 64, 2, 66, ... with both operands x: x[0], x[0], x[2], x[2], ...; expected above as TRN1 on .s.  */
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) });
+}
+
+/*
+** qi_trn1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn1    \3\.s, \3\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     trn1    \4\.s, \4\.s, \5\.s
+**     st1b    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64qi
+qi_trn1_s_two_op (v64qi x, v64qi y)  /* Result is x[0], y[0], x[2], y[2], ...; the pattern above accepts either load order.  */
+{
+  return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) });
+}
+
+/*
+** qi_trn1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32qi
+qi_trn1_d (v32qi x)  /* Mask 0, 32, 2, 34, ... with both operands x: x[0], x[0], x[2], x[2], ...; expected above as TRN1 on .d.  */
+{
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) });
+}
+
+/*
+** qi_trn1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn1    \3\.d, \3\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     trn1    \4\.d, \4\.d, \5\.d
+**     st1b    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32qi
+qi_trn1_d_two_op (v32qi x, v32qi y)  /* Result is x[0], y[0], x[2], y[2], ...; the pattern above accepts either load order.  */
+{
+  return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) });
+}
+
+/*
+** hi_trn1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_trn1_s (v64hi x)  /* Mask 0, 64, 2, 66, ... with both operands x: x[0], x[0], x[2], x[2], ...; expected above as TRN1 on .s.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hi_trn1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn1    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     trn1    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hi
+hi_trn1_s_two_op (v64hi x, v64hi y)  /* Result is x[0], y[0], x[2], y[2], ...; the pattern above accepts either load order.  */
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hf_trn1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_trn1_s (v64hf x)  /* _Float16 variant (integer v64hi selector): x[0], x[0], x[2], x[2], ...; expected above as TRN1 on .s.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hf_trn1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn1    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     trn1    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hf
+hf_trn1_s_two_op (v64hf x, v64hf y)  /* _Float16 variant: x[0], y[0], x[2], y[2], ...; the pattern above accepts either load order.  */
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** bf_trn1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_trn1_s (v64bf x)  /* __bf16 variant (integer v64hi selector): x[0], x[0], x[2], x[2], ...; expected above as TRN1 on .s.  */
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** bf_trn1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn1    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     trn1    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64bf
+bf_trn1_s_two_op (v64bf x, v64bf y)  /* __bf16 variant: x[0], y[0], x[2], y[2], ...; the pattern above accepts either load order.  */
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hi_trn1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hi
+hi_trn1_d (v32hi x)  /* Mask 0, 32, 2, 34, ... with both operands x: x[0], x[0], x[2], x[2], ...; expected above as TRN1 on .d.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hi_trn1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn1    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     trn1    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hi
+hi_trn1_d_two_op (v32hi x, v32hi y)  /* Result is x[0], y[0], x[2], y[2], ...; the pattern above accepts either load order.  */
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hf_trn1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hf
+hf_trn1_d (v32hf x)  /* _Float16 variant (integer v32hi selector): x[0], x[0], x[2], x[2], ...; expected above as TRN1 on .d.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hf_trn1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn1    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     trn1    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hf
+hf_trn1_d_two_op (v32hf x, v32hf y)  /* _Float16 variant: x[0], y[0], x[2], y[2], ...; the pattern above accepts either load order.  */
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** bf_trn1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32bf
+bf_trn1_d (v32bf x)  /* __bf16 variant (integer v32hi selector): x[0], x[0], x[2], x[2], ...; expected above as TRN1 on .d.  */
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** bf_trn1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn1    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     trn1    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32bf
+bf_trn1_d_two_op (v32bf x, v32bf y)  /* __bf16 variant: x[0], y[0], x[2], y[2], ...; the pattern above accepts either load order.  */
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** si_trn1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32si
+si_trn1_d (v32si x)  /* Mask 0, 32, 2, 34, ... with both operands x: x[0], x[0], x[2], x[2], ...; expected above as TRN1 on .d.  */
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
+}
+
+/*
+** sf_trn1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32sf
+sf_trn1_d (v32sf x)  /* float variant (integer v32si selector): x[0], x[0], x[2], x[2], ...; expected above as TRN1 on .d.  */
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c b/gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c

new file mode 100644 (file)

index 0000000..290ce8e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c
@@ -0,0 +1,403 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B, C) B, B + C
+#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C)
+#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C)
+#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C)
+#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C)
+#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C)
+#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C)
+
+/*
+** qi_trn2_h_a:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_trn2_h_a (v128qi x)
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 0) }); /* 1, 1, 3, 3, ..., 127, 127.  */
+}
+
+/*
+** qi_trn2_h_b:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_trn2_h_b (v128qi x)
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 128) }); /* 1, 129, 3, 131, ..., 127, 255.  */
+}
+
+/*
+** qi_trn2_h_c:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_trn2_h_c (v128qi x)
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (129, 0) }); /* 129, 129, ..., 255, 255: operand 2's odd lanes.  */
+}
+
+/*
+** qi_trn2_h_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     trn2    \3\.h, \3\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     trn2    \4\.h, \4\.h, \5\.h
+**     st1b    \4\.h, \1, \[x8\]
+** )
+**     ret
+*/
+v128qi
+qi_trn2_h_two_op (v128qi x, v128qi y)
+{
+  return __builtin_shuffle (x, y, (v128qi) { PERM6 (1, 128) }); /* 1, 129, 3, 131, ..., 127, 255.  */
+}
+
+/*
+** qi_trn2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_trn2_s (v64qi x)
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (1, 64) }); /* 1, 65, 3, 67, ..., 63, 127.  */
+}
+
+/*
+** qi_trn2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn2    \3\.s, \3\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     trn2    \4\.s, \4\.s, \5\.s
+**     st1b    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64qi
+qi_trn2_s_two_op (v64qi x, v64qi y)
+{
+  return __builtin_shuffle (x, y, (v64qi) { PERM5 (1, 64) }); /* 1, 65, 3, 67, ..., 63, 127.  */
+}
+
+/*
+** qi_trn2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32qi
+qi_trn2_d (v32qi x)
+{
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (1, 32) }); /* 1, 33, 3, 35, ..., 31, 63.  */
+}
+
+/*
+** qi_trn2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn2    \3\.d, \3\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     trn2    \4\.d, \4\.d, \5\.d
+**     st1b    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32qi
+qi_trn2_d_two_op (v32qi x, v32qi y)
+{
+  return __builtin_shuffle (x, y, (v32qi) { PERM4 (1, 32) }); /* 1, 33, 3, 35, ..., 31, 63.  */
+}
+
+/*
+** hi_trn2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_trn2_s (v64hi x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) }); /* 1, 65, 3, 67, ..., 63, 127.  */
+}
+
+/*
+** hi_trn2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn2    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     trn2    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hi
+hi_trn2_s_two_op (v64hi x, v64hi y)
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) }); /* 1, 65, 3, 67, ..., 63, 127.  */
+}
+
+/*
+** hf_trn2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_trn2_s (v64hf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) }); /* 1, 65, 3, 67, ..., 63, 127.  */
+}
+
+/*
+** hf_trn2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn2    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     trn2    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hf
+hf_trn2_s_two_op (v64hf x, v64hf y)
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) }); /* 1, 65, 3, 67, ..., 63, 127.  */
+}
+
+/*
+** bf_trn2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_trn2_s (v64bf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) }); /* 1, 65, 3, 67, ..., 63, 127.  */
+}
+
+/*
+** bf_trn2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     trn2    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     trn2    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64bf
+bf_trn2_s_two_op (v64bf x, v64bf y)
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) }); /* 1, 65, 3, 67, ..., 63, 127.  */
+}
+
+/*
+** hi_trn2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hi
+hi_trn2_d (v32hi x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) }); /* 1, 33, 3, 35, ..., 31, 63.  */
+}
+
+/*
+** hi_trn2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn2    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     trn2    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hi
+hi_trn2_d_two_op (v32hi x, v32hi y)
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) }); /* 1, 33, 3, 35, ..., 31, 63.  */
+}
+
+/*
+** hf_trn2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hf
+hf_trn2_d (v32hf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) }); /* 1, 33, 3, 35, ..., 31, 63.  */
+}
+
+/*
+** hf_trn2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn2    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     trn2    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hf
+hf_trn2_d_two_op (v32hf x, v32hf y)
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) }); /* 1, 33, 3, 35, ..., 31, 63.  */
+}
+
+/*
+** bf_trn2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32bf
+bf_trn2_d (v32bf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) }); /* 1, 33, 3, 35, ..., 31, 63.  */
+}
+
+/*
+** bf_trn2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn2    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     trn2    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32bf
+bf_trn2_d_two_op (v32bf x, v32bf y)
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) }); /* 1, 33, 3, 35, ..., 31, 63.  */
+}
+
+/*
+** si_trn2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32si
+si_trn2_d (v32si x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) }); /* 1, 33, 3, 35, ..., 31, 63.  */
+}
+
+/*
+** sf_trn2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     trn2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32sf
+sf_trn2_d (v32sf x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) }); /* 1, 33, 3, 35, ..., 31, 63.  */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/uzp1_2.c b/gcc/testsuite/gcc.target/aarch64/sve/uzp1_2.c

new file mode 100644 (file)

index 0000000..e2f2692
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/uzp1_2.c
@@ -0,0 +1,375 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B) B, B + 2
+#define PERM1(B) PERM0 (B), PERM0 (B + 4)
+#define PERM2(B) PERM1 (B), PERM1 (B + 8)
+#define PERM3(B) PERM2 (B), PERM2 (B + 16)
+#define PERM4(B) PERM3 (B), PERM3 (B + 32)
+#define PERM5(B) PERM4 (B), PERM4 (B + 64)
+#define PERM6(B) PERM5 (B), PERM5 (B + 128)
+
+/*
+** qi_uzp1_h:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     uzp1    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_uzp1_h (v128qi x)
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); /* 0, 2, 4, ..., 254.  */
+}
+
+/*
+** qi_uzp1_h_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     uzp1    \3\.h, \3\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     uzp1    \4\.h, \4\.h, \5\.h
+**     st1b    \4\.h, \1, \[x8\]
+** )
+**     ret
+*/
+v128qi
+qi_uzp1_h_two_op (v128qi x, v128qi y)
+{
+  return __builtin_shuffle (x, y, (v128qi) { PERM6 (0) }); /* 0, 2, 4, ..., 254.  */
+}
+
+/*
+** qi_uzp1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_uzp1_s (v64qi x)
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) }); /* 0, 2, 4, ..., 126.  */
+}
+
+/*
+** qi_uzp1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp1    \3\.s, \3\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     uzp1    \4\.s, \4\.s, \5\.s
+**     st1b    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64qi
+qi_uzp1_s_two_op (v64qi x, v64qi y)
+{
+  return __builtin_shuffle (x, y, (v64qi) { PERM5 (0) }); /* 0, 2, 4, ..., 126.  */
+}
+
+/*
+** qi_uzp1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32qi
+qi_uzp1_d (v32qi x)
+{
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (0) }); /* 0, 2, 4, ..., 62.  */
+}
+
+/*
+** qi_uzp1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp1    \3\.d, \3\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     uzp1    \4\.d, \4\.d, \5\.d
+**     st1b    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32qi
+qi_uzp1_d_two_op (v32qi x, v32qi y)
+{
+  return __builtin_shuffle (x, y, (v32qi) { PERM4 (0) }); /* 0, 2, 4, ..., 62.  */
+}
+
+/*
+** hi_uzp1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_uzp1_s (v64hi x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); /* 0, 2, 4, ..., 126.  */
+}
+
+/*
+** hi_uzp1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp1    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     uzp1    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hi
+hi_uzp1_s_two_op (v64hi x, v64hi y)
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) }); /* 0, 2, 4, ..., 126.  */
+}
+
+/*
+** hf_uzp1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_uzp1_s (v64hf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); /* 0, 2, 4, ..., 126.  */
+}
+
+/*
+** hf_uzp1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp1    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     uzp1    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hf
+hf_uzp1_s_two_op (v64hf x, v64hf y)
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) }); /* 0, 2, 4, ..., 126.  */
+}
+
+/*
+** bf_uzp1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_uzp1_s (v64bf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); /* 0, 2, 4, ..., 126.  */
+}
+
+/*
+** bf_uzp1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp1    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     uzp1    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64bf
+bf_uzp1_s_two_op (v64bf x, v64bf y)
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) }); /* 0, 2, 4, ..., 126.  */
+}
+
+/*
+** hi_uzp1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hi
+hi_uzp1_d (v32hi x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) }); /* 0, 2, 4, ..., 62.  */
+}
+
+/*
+** hi_uzp1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp1    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     uzp1    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hi
+hi_uzp1_d_two_op (v32hi x, v32hi y)
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) }); /* 0, 2, 4, ..., 62.  */
+}
+
+/*
+** hf_uzp1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hf
+hf_uzp1_d (v32hf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) }); /* 0, 2, 4, ..., 62.  */
+}
+
+/*
+** hf_uzp1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp1    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     uzp1    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hf
+hf_uzp1_d_two_op (v32hf x, v32hf y)
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) }); /* 0, 2, 4, ..., 62.  */
+}
+
+/*
+** bf_uzp1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32bf
+bf_uzp1_d (v32bf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) }); /* 0, 2, 4, ..., 62.  */
+}
+
+/*
+** bf_uzp1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp1    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     uzp1    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32bf
+bf_uzp1_d_two_op (v32bf x, v32bf y)
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) }); /* 0, 2, 4, ..., 62.  */
+}
+
+/*
+** si_uzp1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32si
+si_uzp1_d (v32si x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (0) }); /* 0, 2, 4, ..., 62.  */
+}
+
+/*
+** sf_uzp1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32sf
+sf_uzp1_d (v32sf x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (0) }); /* 0, 2, 4, ..., 62.  */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/uzp2_2.c b/gcc/testsuite/gcc.target/aarch64/sve/uzp2_2.c

new file mode 100644 (file)

index 0000000..0d8eda5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/uzp2_2.c
@@ -0,0 +1,375 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B) B, B + 2
+#define PERM1(B) PERM0 (B), PERM0 (B + 4)
+#define PERM2(B) PERM1 (B), PERM1 (B + 8)
+#define PERM3(B) PERM2 (B), PERM2 (B + 16)
+#define PERM4(B) PERM3 (B), PERM3 (B + 32)
+#define PERM5(B) PERM4 (B), PERM4 (B + 64)
+#define PERM6(B) PERM5 (B), PERM5 (B + 128)
+
+/*
+** qi_uzp2_h:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     uzp2    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_uzp2_h (v128qi x)
+{
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) }); /* 1, 3, 5, ..., 255.  */
+}
+
+/*
+** qi_uzp2_h_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     uzp2    \3\.h, \3\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     uzp2    \4\.h, \4\.h, \5\.h
+**     st1b    \4\.h, \1, \[x8\]
+** )
+**     ret
+*/
+v128qi
+qi_uzp2_h_two_op (v128qi x, v128qi y)
+{
+  return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) }); /* 1, 3, 5, ..., 255.  */
+}
+
+/*
+** qi_uzp2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_uzp2_s (v64qi x)
+{
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) }); /* 1, 3, 5, ..., 127.  */
+}
+
+/*
+** qi_uzp2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp2    \3\.s, \3\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     uzp2    \4\.s, \4\.s, \5\.s
+**     st1b    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64qi
+qi_uzp2_s_two_op (v64qi x, v64qi y)
+{
+  return __builtin_shuffle (x, y, (v64qi) { PERM5 (1) }); /* 1, 3, 5, ..., 127.  */
+}
+
+/*
+** qi_uzp2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32qi
+qi_uzp2_d (v32qi x)
+{
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) }); /* 1, 3, 5, ..., 63.  */
+}
+
+/*
+** qi_uzp2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp2    \3\.d, \3\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     uzp2    \4\.d, \4\.d, \5\.d
+**     st1b    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32qi
+qi_uzp2_d_two_op (v32qi x, v32qi y)
+{
+  return __builtin_shuffle (x, y, (v32qi) { PERM4 (1) }); /* 1, 3, 5, ..., 63.  */
+}
+
+/*
+** hi_uzp2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_uzp2_s (v64hi x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); /* 1, 3, 5, ..., 127.  */
+}
+
+/*
+** hi_uzp2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp2    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     uzp2    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hi
+hi_uzp2_s_two_op (v64hi x, v64hi y)
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) }); /* 1, 3, 5, ..., 127.  */
+}
+
+/*
+** hf_uzp2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_uzp2_s (v64hf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); /* 1, 3, 5, ..., 127.  */
+}
+
+/*
+** hf_uzp2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp2    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     uzp2    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hf
+hf_uzp2_s_two_op (v64hf x, v64hf y)
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) }); /* 1, 3, 5, ..., 127.  */
+}
+
+/*
+** bf_uzp2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_uzp2_s (v64bf x)
+{
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); /* 1, 3, 5, ..., 127.  */
+}
+
+/*
+** bf_uzp2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     uzp2    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     uzp2    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64bf
+bf_uzp2_s_two_op (v64bf x, v64bf y)
+{
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) }); /* 1, 3, 5, ..., 127.  */
+}
+
+/*
+** hi_uzp2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hi
+hi_uzp2_d (v32hi x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); /* 1, 3, 5, ..., 63.  */
+}
+
+/*
+** hi_uzp2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp2    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     uzp2    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hi
+hi_uzp2_d_two_op (v32hi x, v32hi y)
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) }); /* 1, 3, 5, ..., 63.  */
+}
+
+/*
+** hf_uzp2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hf
+hf_uzp2_d (v32hf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); /* 1, 3, 5, ..., 63.  */
+}
+
+/*
+** hf_uzp2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp2    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     uzp2    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hf
+hf_uzp2_d_two_op (v32hf x, v32hf y)
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) }); /* 1, 3, 5, ..., 63.  */
+}
+
+/*
+** bf_uzp2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32bf
+bf_uzp2_d (v32bf x)
+{
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); /* 1, 3, 5, ..., 63.  */
+}
+
+/*
+** bf_uzp2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp2    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     uzp2    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32bf
+bf_uzp2_d_two_op (v32bf x, v32bf y)
+{
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) }); /* 1, 3, 5, ..., 63.  */
+}
+
+/*
+** si_uzp2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32si
+si_uzp2_d (v32si x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); /* 1, 3, 5, ..., 63.  */
+}
+
+/*
+** sf_uzp2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     uzp2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32sf
+sf_uzp2_d (v32sf x)
+{
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); /* 1, 3, 5, ..., 63.  */
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/zip1_2.c b/gcc/testsuite/gcc.target/aarch64/sve/zip1_2.c

new file mode 100644 (file)

index 0000000..395b96f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/zip1_2.c
@@ -0,0 +1,403 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+/* PERMn (B, C) yields 2^(n+1) indices: B, B + C, B + 1, B + 1 + C, ...  */
+#define PERM0(B, C) B, B + C
+#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C)
+#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C)
+#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C)
+#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C)
+#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C)
+#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C)
+
+/*
+** qi_zip1_h_a:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_zip1_h_a (v128qi x)
+{ /* Mask 0, 0, 1, 1, ..., 63, 63: every index names the first operand.  */
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) });
+}
+
+/*
+** qi_zip1_h_b:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_zip1_h_b (v128qi x)
+{ /* Mask 0, 128, 1, 129, ...: odd lanes name the second operand (also x).  */
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) });
+}
+
+/*
+** qi_zip1_h_c:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_zip1_h_c (v128qi x)
+{ /* Mask 128, 128, 129, 129, ...: every index names the second operand.  */
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) });
+}
+
+/*
+** qi_zip1_h_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     zip1    \3\.h, \3\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     zip1    \4\.h, \4\.h, \5\.h
+**     st1b    \4\.h, \1, \[x8\]
+** )
+**     ret
+*/
+v128qi
+qi_zip1_h_two_op (v128qi x, v128qi y)
+{ /* Mask 0, 128, 1, 129, ...: interleave the low halves of x and y.  */
+  return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) });
+}
+
+/*
+** qi_zip1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_zip1_s (v64qi x)
+{ /* Mask 0, 64, 1, 65, ..., 31, 95; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) });
+}
+
+/*
+** qi_zip1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip1    \3\.s, \3\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     zip1    \4\.s, \4\.s, \5\.s
+**     st1b    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64qi
+qi_zip1_s_two_op (v64qi x, v64qi y)
+{ /* Mask 0, 64, 1, 65, ...: interleave the low halves of x and y.  */
+  return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) });
+}
+
+/*
+** qi_zip1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32qi
+qi_zip1_d (v32qi x)
+{ /* Mask 0, 32, 1, 33, ..., 15, 47; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) });
+}
+
+/*
+** qi_zip1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip1    \3\.d, \3\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     zip1    \4\.d, \4\.d, \5\.d
+**     st1b    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32qi
+qi_zip1_d_two_op (v32qi x, v32qi y)
+{ /* Mask 0, 32, 1, 33, ...: interleave the low halves of x and y.  */
+  return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) });
+}
+
+/*
+** hi_zip1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_zip1_s (v64hi x)
+{ /* Mask 0, 64, 1, 65, ..., 31, 95; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hi_zip1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip1    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     zip1    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hi
+hi_zip1_s_two_op (v64hi x, v64hi y)
+{ /* Mask 0, 64, 1, 65, ...: interleave the low halves of x and y.  */
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hf_zip1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_zip1_s (v64hf x)
+{ /* Integer v64hi mask 0, 64, 1, 65, ...; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hf_zip1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip1    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     zip1    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hf
+hf_zip1_s_two_op (v64hf x, v64hf y)
+{ /* Integer v64hi mask 0, 64, 1, 65, ...: low halves of x and y.  */
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** bf_zip1_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_zip1_s (v64bf x)
+{ /* Integer v64hi mask 0, 64, 1, 65, ...; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** bf_zip1_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip1    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     zip1    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64bf
+bf_zip1_s_two_op (v64bf x, v64bf y)
+{ /* Integer v64hi mask 0, 64, 1, 65, ...: low halves of x and y.  */
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hi_zip1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hi
+hi_zip1_d (v32hi x)
+{ /* Mask 0, 32, 1, 33, ..., 15, 47; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hi_zip1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip1    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     zip1    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hi
+hi_zip1_d_two_op (v32hi x, v32hi y)
+{ /* Mask 0, 32, 1, 33, ...: interleave the low halves of x and y.  */
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hf_zip1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hf
+hf_zip1_d (v32hf x)
+{ /* Integer v32hi mask 0, 32, 1, 33, ...; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hf_zip1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip1    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     zip1    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hf
+hf_zip1_d_two_op (v32hf x, v32hf y)
+{ /* Integer v32hi mask 0, 32, 1, 33, ...: low halves of x and y.  */
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** bf_zip1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32bf
+bf_zip1_d (v32bf x)
+{ /* Integer v32hi mask 0, 32, 1, 33, ...; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** bf_zip1_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip1    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     zip1    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32bf
+bf_zip1_d_two_op (v32bf x, v32bf y)
+{ /* Integer v32hi mask 0, 32, 1, 33, ...: low halves of x and y.  */
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** si_zip1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32si
+si_zip1_d (v32si x)
+{ /* Mask 0, 32, 1, 33, ..., 15, 47; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
+}
+
+/*
+** sf_zip1_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip1    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32sf
+sf_zip1_d (v32sf x)
+{ /* Integer v32si mask 0, 32, 1, 33, ...; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/zip2_2.c b/gcc/testsuite/gcc.target/aarch64/sve/zip2_2.c

new file mode 100644 (file)

index 0000000..9158ace
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/zip2_2.c
@@ -0,0 +1,403 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+/* PERMn (B, C) yields 2^(n+1) indices: B, B + C, B + 1, B + 1 + C, ...  */
+#define PERM0(B, C) B, B + C
+#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C)
+#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C)
+#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C)
+#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C)
+#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C)
+#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C)
+
+/*
+** qi_zip2_h_a:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_zip2_h_a (v128qi x)
+{ /* Mask 64, 64, 65, 65, ...: every index names the first operand.  */
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 0) });
+}
+
+/*
+** qi_zip2_h_b:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_zip2_h_b (v128qi x)
+{ /* Mask 64, 192, 65, 193, ...: odd lanes name the second operand (x).  */
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 128) });
+}
+
+/*
+** qi_zip2_h_c:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.h, \2\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+**     ret
+*/
+v128qi
+qi_zip2_h_c (v128qi x)
+{ /* Mask 192, 192, 193, 193, ...: every index names the second operand.  */
+  return __builtin_shuffle (x, x, (v128qi) { PERM6 (192, 0) });
+}
+
+/*
+** qi_zip2_h_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     zip2    \3\.h, \3\.h, \2\.h
+**     st1b    \3\.h, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.h, \1/z, \[x1\]
+**     zip2    \4\.h, \4\.h, \5\.h
+**     st1b    \4\.h, \1, \[x8\]
+** )
+**     ret
+*/
+v128qi
+qi_zip2_h_two_op (v128qi x, v128qi y)
+{ /* Mask 64, 192, 65, 193, ...: interleave the high halves of x and y.  */
+  return __builtin_shuffle (x, y, (v128qi) { PERM6 (64, 128) });
+}
+
+/*
+** qi_zip2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64qi
+qi_zip2_s (v64qi x)
+{ /* Mask 32, 96, 33, 97, ..., 63, 127; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v64qi) { PERM5 (32, 64) });
+}
+
+/*
+** qi_zip2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip2    \3\.s, \3\.s, \2\.s
+**     st1b    \3\.s, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.s, \1/z, \[x1\]
+**     zip2    \4\.s, \4\.s, \5\.s
+**     st1b    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64qi
+qi_zip2_s_two_op (v64qi x, v64qi y)
+{ /* Mask 32, 96, 33, 97, ...: interleave the high halves of x and y.  */
+  return __builtin_shuffle (x, y, (v64qi) { PERM5 (32, 64) });
+}
+
+/*
+** qi_zip2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32qi
+qi_zip2_d (v32qi x)
+{ /* Mask 16, 48, 17, 49, ..., 31, 63; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32qi) { PERM4 (16, 32) });
+}
+
+/*
+** qi_zip2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip2    \3\.d, \3\.d, \2\.d
+**     st1b    \3\.d, \1, \[x8\]
+** |
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1b    (z[0-9]+)\.d, \1/z, \[x1\]
+**     zip2    \4\.d, \4\.d, \5\.d
+**     st1b    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32qi
+qi_zip2_d_two_op (v32qi x, v32qi y)
+{ /* Mask 16, 48, 17, 49, ...: interleave the high halves of x and y.  */
+  return __builtin_shuffle (x, y, (v32qi) { PERM4 (16, 32) });
+}
+
+/*
+** hi_zip2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hi
+hi_zip2_s (v64hi x)
+{ /* Mask 32, 96, 33, 97, ..., 63, 127; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** hi_zip2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip2    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     zip2    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hi
+hi_zip2_s_two_op (v64hi x, v64hi y)
+{ /* Mask 32, 96, 33, 97, ...: interleave the high halves of x and y.  */
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** hf_zip2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64hf
+hf_zip2_s (v64hf x)
+{ /* Integer v64hi mask 32, 96, 33, 97, ...; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** hf_zip2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip2    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     zip2    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64hf
+hf_zip2_s_two_op (v64hf x, v64hf y)
+{ /* Integer v64hi mask 32, 96, 33, 97, ...: high halves of x and y.  */
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** bf_zip2_s:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.s, \2\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+**     ret
+*/
+v64bf
+bf_zip2_s (v64bf x)
+{ /* Integer v64hi mask 32, 96, 33, 97, ...; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** bf_zip2_s_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     zip2    \3\.s, \3\.s, \2\.s
+**     st1h    \3\.s, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.s, \1/z, \[x1\]
+**     zip2    \4\.s, \4\.s, \5\.s
+**     st1h    \4\.s, \1, \[x8\]
+** )
+**     ret
+*/
+v64bf
+bf_zip2_s_two_op (v64bf x, v64bf y)
+{ /* Integer v64hi mask 32, 96, 33, 97, ...: high halves of x and y.  */
+  return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** hi_zip2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hi
+hi_zip2_d (v32hi x)
+{ /* Mask 16, 48, 17, 49, ..., 31, 63; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** hi_zip2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip2    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     zip2    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hi
+hi_zip2_d_two_op (v32hi x, v32hi y)
+{ /* Mask 16, 48, 17, 49, ...: interleave the high halves of x and y.  */
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** hf_zip2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32hf
+hf_zip2_d (v32hf x)
+{ /* Integer v32hi mask 16, 48, 17, 49, ...; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** hf_zip2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip2    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     zip2    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32hf
+hf_zip2_d_two_op (v32hf x, v32hf y)
+{ /* Integer v32hi mask 16, 48, 17, 49, ...: high halves of x and y.  */
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** bf_zip2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32bf
+bf_zip2_d (v32bf x)
+{ /* Integer v32hi mask 16, 48, 17, 49, ...; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** bf_zip2_d_two_op:
+**     ptrue   (p[0-7])\.b, vl256
+** (
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip2    \3\.d, \3\.d, \2\.d
+**     st1h    \3\.d, \1, \[x8\]
+** |
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x0\]
+**     ld1h    (z[0-9]+)\.d, \1/z, \[x1\]
+**     zip2    \4\.d, \4\.d, \5\.d
+**     st1h    \4\.d, \1, \[x8\]
+** )
+**     ret
+*/
+v32bf
+bf_zip2_d_two_op (v32bf x, v32bf y)
+{ /* Integer v32hi mask 16, 48, 17, 49, ...: high halves of x and y.  */
+  return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** si_zip2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32si
+si_zip2_d (v32si x)
+{ /* Mask 16, 48, 17, 49, ..., 31, 63; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) });
+}
+
+/*
+** sf_zip2_d:
+**     ptrue   (p[0-7])\.b, vl256
+**     ld1w    (z[0-9]+)\.d, \1/z, \[x0\]
+**     zip2    (z[0-9]+)\.d, \2\.d, \2\.d
+**     st1w    \3\.d, \1, \[x8\]
+**     ret
+*/
+v32sf
+sf_zip2_d (v32sf x)
+{ /* Integer v32si mask 16, 48, 17, 49, ...; both shuffle operands are x.  */
+  return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) });
+}
author	Richard Sandiford <richard.sandiford@arm.com>
	Fri, 6 Nov 2020 16:49:28 +0000 (16:49 +0000)
committer	Richard Sandiford <richard.sandiford@arm.com>
	Fri, 6 Nov 2020 16:49:28 +0000 (16:49 +0000)
gcc/config/aarch64/aarch64-modes.def		patch \| blob \| history
gcc/config/aarch64/aarch64-sve.md		patch \| blob \| history
gcc/config/aarch64/aarch64.c		patch \| blob \| history
gcc/config/aarch64/iterators.md		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/ext_4.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/rev_2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/uzp1_2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/uzp2_2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/zip1_2.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/aarch64/sve/zip2_2.c	[new file with mode: 0644]	patch \| blob