ADJUST_NUNITS (VNx2HI, aarch64_sve_vg);
ADJUST_NUNITS (VNx2SI, aarch64_sve_vg);
ADJUST_NUNITS (VNx2HF, aarch64_sve_vg);
+ADJUST_NUNITS (VNx2BF, aarch64_sve_vg);
ADJUST_NUNITS (VNx2SF, aarch64_sve_vg);
ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2);
ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2);
ADJUST_NUNITS (VNx4HF, aarch64_sve_vg * 2);
+ADJUST_NUNITS (VNx4BF, aarch64_sve_vg * 2);
ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4);
ADJUST_ALIGNMENT (VNx2HI, 2);
ADJUST_ALIGNMENT (VNx4HI, 2);
ADJUST_ALIGNMENT (VNx2HF, 2);
+ADJUST_ALIGNMENT (VNx2BF, 2);
ADJUST_ALIGNMENT (VNx4HF, 2);
+ADJUST_ALIGNMENT (VNx4BF, 2);
ADJUST_ALIGNMENT (VNx2SI, 4);
ADJUST_ALIGNMENT (VNx2SF, 4);
"<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>"
)
+;; Another way of expressing the REVB, REVH and REVW patterns, with this
+;; form being easier for permutes. The predicate mode determines the number
+;; of lanes and the data mode decides the granularity of the reversal within
+;; each lane.
+;;
+;; Operand 1 is the predicate (printed as %1/m) and operand 2 is the vector
+;; being reversed. The insn condition only accepts combinations in which
+;; the predicate's element width (elem_bits) is strictly greater than the
+;; data mode's container width (container_bits). The mnemonic suffix is
+;; taken from the data mode (Vcwtype) whereas the register suffixes are
+;; taken from the predicate mode (Vetype).
+(define_insn "@aarch64_sve_revbhw_<SVE_ALL:mode><PRED_HSD:mode>"
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (unspec:SVE_ALL
+ [(match_operand:PRED_HSD 1 "register_operand" "Upl")
+ (unspec:SVE_ALL
+ [(match_operand:SVE_ALL 2 "register_operand" "w")]
+ UNSPEC_REVBHW)]
+ UNSPEC_PRED_X))]
+ "TARGET_SVE && <PRED_HSD:elem_bits> > <SVE_ALL:container_bits>"
+ "rev<SVE_ALL:Vcwtype>\t%0.<PRED_HSD:Vetype>, %1/m, %2.<PRED_HSD:Vetype>"
+)
+
;; Predicated integer unary operations with merging.
(define_insn "@cond_<optab><mode>"
[(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w, ?&w")
;; Duplicate one element of a vector.
+;;
+;; Operand 2 is the lane index. The insn condition converts it to a byte
+;; offset by scaling with the container size (<container_bits> / 8) and
+;; requires the result to be in the range [0, 63]. The register suffix is
+;; the container suffix (Vctype) rather than the element suffix.
(define_insn "@aarch64_sve_dup_lane<mode>"
- [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
- (vec_duplicate:SVE_FULL
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (vec_duplicate:SVE_ALL
(vec_select:<VEL>
- (match_operand:SVE_FULL 1 "register_operand" "w")
+ (match_operand:SVE_ALL 1 "register_operand" "w")
(parallel [(match_operand:SI 2 "const_int_operand")]))))]
"TARGET_SVE
- && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (<VEL>mode), 0, 63)"
- "dup\t%0.<Vetype>, %1.<Vetype>[%2]"
+ && IN_RANGE (INTVAL (operands[2]) * <container_bits> / 8, 0, 63)"
+ "dup\t%0.<Vctype>, %1.<Vctype>[%2]"
)
;; Use DUP.Q to duplicate a 128-bit segment of a register.
;; Reverse the order of elements within a full vector.
+;;
+;; The register suffix is the container suffix (Vctype), so the instruction
+;; operates on whole containers for every mode in SVE_ALL.
(define_insn "@aarch64_sve_rev<mode>"
- [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
- (unspec:SVE_FULL
- [(match_operand:SVE_FULL 1 "register_operand" "w")]
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (unspec:SVE_ALL
+ [(match_operand:SVE_ALL 1 "register_operand" "w")]
UNSPEC_REV))]
"TARGET_SVE"
- "rev\t%0.<Vetype>, %1.<Vetype>")
+ "rev\t%0.<Vctype>, %1.<Vctype>")
;; -------------------------------------------------------------------------
;; ---- [INT,FP] Special-purpose binary permutes
;; -------------------------------------------------------------------------
;; Includes:
+;; - EXT
;; - SPLICE
;; - TRN1
;; - TRN2
;; Permutes that take half the elements from one vector and half the
;; elements from the other.
+;;
+;; All three registers are printed with the container suffix (Vctype), so
+;; the permute is performed at container granularity for every mode in
+;; SVE_ALL.
(define_insn "@aarch64_sve_<perm_insn><mode>"
- [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
- (unspec:SVE_FULL
- [(match_operand:SVE_FULL 1 "register_operand" "w")
- (match_operand:SVE_FULL 2 "register_operand" "w")]
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+ (unspec:SVE_ALL
+ [(match_operand:SVE_ALL 1 "register_operand" "w")
+ (match_operand:SVE_ALL 2 "register_operand" "w")]
PERMUTE))]
"TARGET_SVE"
- "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>"
+ "<perm_insn>\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>"
)
;; Apply PERMUTE to 128-bit sequences. The behavior of these patterns
;; Concatenate two vectors and extract a subvector. Note that the
;; immediate (third) operand is the lane index not the byte index.
(define_insn "@aarch64_sve_ext<mode>"
- [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w")
- (unspec:SVE_FULL
- [(match_operand:SVE_FULL 1 "register_operand" "0, w")
- (match_operand:SVE_FULL 2 "register_operand" "w, w")
+ [(set (match_operand:SVE_ALL 0 "register_operand" "=w, ?&w")
+ (unspec:SVE_ALL
+ [(match_operand:SVE_ALL 1 "register_operand" "0, w")
+ (match_operand:SVE_ALL 2 "register_operand" "w, w")
(match_operand:SI 3 "const_int_operand")]
UNSPEC_EXT))]
"TARGET_SVE
- && IN_RANGE (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode), 0, 255)"
+ && IN_RANGE (INTVAL (operands[3]) * <container_bits> / 8, 0, 255)"
{
- operands[3] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode));
+ /* The instruction is always printed with .b registers, so turn the
+ lane index into a byte offset by scaling it with the container
+ size in bytes. The insn condition has already checked that the
+ scaled value is in the range [0, 255]. */
+ operands[3] = GEN_INT (INTVAL (operands[3]) * <container_bits> / 8);
return (which_alternative == 0
? "ext\\t%0.b, %0.b, %2.b, #%3"
: "movprfx\t%0, %1\;ext\\t%0.b, %0.b, %2.b, #%3");
/* Partial SVE HF vectors. */
case E_VNx2HFmode:
case E_VNx4HFmode:
+ /* Partial SVE BF vectors. */
+ case E_VNx2BFmode:
+ case E_VNx4BFmode:
/* Partial SVE SF vector. */
case E_VNx2SFmode:
return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
|| !diff)
return false;
- size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
- if (size == 8)
+ /* SIZE is measured in bits. SVE data modes use the container width
+ reported by aarch64_sve_container_bits; all other modes use the
+ width of a single element. */
+ if (d->vec_flags & VEC_SVE_DATA)
+ size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
+ else
+ size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
+ if (size == 64)
{
unspec = UNSPEC_REV64;
pred_mode = VNx2BImode;
}
- else if (size == 4)
+ else if (size == 32)
{
unspec = UNSPEC_REV32;
pred_mode = VNx4BImode;
}
- else if (size == 2)
+ else if (size == 16)
{
unspec = UNSPEC_REV16;
pred_mode = VNx8BImode;
if (d->testing_p)
return true;
- if (d->vec_flags == VEC_SVE_DATA)
- {
- machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
- rtx target = gen_reg_rtx (int_mode);
- if (BYTES_BIG_ENDIAN)
- /* The act of taking a subreg between INT_MODE and d->vmode
- is itself a reversing operation on big-endian targets;
- see the comment at the head of aarch64-sve.md for details.
- First reinterpret OP0 as INT_MODE without using a subreg
- and without changing the contents. */
- emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
- else
- {
- /* For SVE we use REV[BHW] unspecs derived from the element size
- of v->mode and vector modes whose elements have SIZE bytes.
- This ensures that the vector modes match the predicate modes. */
- int unspec = aarch64_sve_rev_unspec (d->vmode);
- rtx pred = aarch64_ptrue_reg (pred_mode);
- emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
- gen_lowpart (int_mode, d->op0)));
- }
- emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+ if (d->vec_flags & VEC_SVE_DATA)
+ {
+ /* Testing the flag with '&' also accepts modes that carry extra
+ flags alongside VEC_SVE_DATA, such as VEC_PARTIAL. PRED_MODE
+ was chosen above from the size of the reversal, so together
+ with d->vmode it selects the revbhw pattern to emit. */
+ rtx pred = aarch64_ptrue_reg (pred_mode);
+ emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
+ d->target, pred, d->op0));
return true;
}
rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
|| !d->perm[0].is_constant (&elt))
return false;
- if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
+ /* For SVE data modes, reject the permute if the byte offset of the
+ selected element, computed from the container size, is 64 or more. */
+ if ((d->vec_flags & VEC_SVE_DATA)
+ && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
return false;
/* Success! */
if ((d->vec_flags == VEC_ADVSIMD
|| d->vec_flags == VEC_SVE_DATA
+ || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
|| d->vec_flags == VEC_SVE_PRED)
&& known_gt (nelt, 1))
{
(define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI
VNx8HI VNx4HI VNx2HI
VNx8HF VNx4HF VNx2HF
- VNx8BF
+ VNx8BF VNx4BF VNx2BF
VNx4SI VNx2SI
VNx4SF VNx2SF
VNx2DI
VNx2DI])
;; SVE modes with 2 or 4 elements.
-(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF
- VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2BF VNx2SI VNx2SF
+ VNx2DI VNx2DF
+ VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF])
;; SVE modes with 2 elements.
-(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF])
+(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2BF
+ VNx2SI VNx2SF VNx2DI VNx2DF])
;; SVE integer modes with 2 elements, excluding the widest element.
(define_mode_iterator SVE_2BHSI [VNx2QI VNx2HI VNx2SI])
(define_mode_iterator SVE_2HSDI [VNx2HI VNx2SI VNx2DI])
;; SVE modes with 4 elements.
-(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF])
+(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF])
;; SVE integer modes with 4 elements, excluding the widest element.
(define_mode_iterator SVE_4BHI [VNx4QI VNx4HI])
UNSPEC_REVB ; Used in aarch64-sve.md.
UNSPEC_REVH ; Used in aarch64-sve.md.
UNSPEC_REVW ; Used in aarch64-sve.md.
+ UNSPEC_REVBHW ; Used in aarch64-sve.md.
UNSPEC_SMUL_HIGHPART ; Used in aarch64-sve.md.
UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md.
UNSPEC_FMLA ; Used in aarch64-sve.md.
(VNx4SI "32") (VNx2DI "64")
(VNx8HF "16") (VNx4SF "32") (VNx2DF "64")])
+;; The number of bits in a vector container. For every mode listed here
+;; the value is 128 divided by the element count in the mode name
+;; (VNx16 -> 8, VNx8 -> 16, VNx4 -> 32, VNx2 -> 64), independently of the
+;; element type.
+(define_mode_attr container_bits [(VNx16QI "8")
+ (VNx8HI "16") (VNx8QI "16") (VNx8HF "16")
+ (VNx8BF "16")
+ (VNx4SI "32") (VNx4HI "32") (VNx4QI "32")
+ (VNx4SF "32") (VNx4HF "32") (VNx4BF "32")
+ (VNx2DI "64") (VNx2SI "64") (VNx2HI "64")
+ (VNx2QI "64") (VNx2DF "64") (VNx2SF "64")
+ (VNx2HF "64") (VNx2BF "64")])
+
;; Attribute to describe constants acceptable in logical operations
(define_mode_attr lconst [(SI "K") (DI "L")])
(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
(VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
(VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
- (VNx8BF "h")
+ (VNx8BF "h") (VNx4BF "h") (VNx2BF "h")
(VNx4SI "s") (VNx2SI "s")
(VNx4SF "s") (VNx2SF "s")
(VNx2DI "d")
(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
(VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
(VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
- (VNx8BF "h")
+ (VNx8BF "h") (VNx4BF "h") (VNx2BF "h")
(VNx4SI "w") (VNx2SI "w")
(VNx4SF "w") (VNx2SF "w")
(VNx2DI "d")
(define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d")
(VNx8HI "h") (VNx4HI "s") (VNx2HI "d")
(VNx8HF "h") (VNx4HF "s") (VNx2HF "d")
- (VNx8BF "h")
+ (VNx8BF "h") (VNx4BF "s") (VNx2BF "d")
(VNx4SI "s") (VNx2SI "d")
(VNx4SF "s") (VNx2SF "d")
(VNx2DI "d")
(VNx2DF "d")])
+;; The instruction mnemonic suffix for an SVE mode's element container,
+;; i.e. the Vewtype of full SVE modes that have the same number of elements.
+(define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d")
+ (VNx8HI "h") (VNx4HI "w") (VNx2HI "d")
+ (VNx8HF "h") (VNx4HF "w") (VNx2HF "d")
+ (VNx8BF "h") (VNx4BF "w") (VNx2BF "d")
+ (VNx4SI "w") (VNx2SI "d")
+ (VNx4SF "w") (VNx2SF "d")
+ (VNx2DI "d")
+ (VNx2DF "d")])
+
;; Vetype is used everywhere in scheduling type and assembly output,
;; sometimes they are not the same, for example HF modes on some
;; instructions. stype is defined to represent scheduling type
(VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
(VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
(VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
- (VNx8BF "BF")
+ (VNx8BF "BF") (VNx4BF "BF") (VNx2BF "BF")
(VNx4SI "SI") (VNx2SI "SI")
(VNx4SF "SF") (VNx2SF "SF")
(VNx2DI "DI")
(VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
(VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
(VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
- (VNx8BF "bf")
+ (VNx8BF "bf") (VNx4BF "bf") (VNx2BF "bf")
(VNx4SI "si") (VNx2SI "si")
(VNx4SF "sf") (VNx2SF "sf")
(VNx2DI "di")
(VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w")
(VNx8HI "w") (VNx4HI "w") (VNx2HI "w")
(VNx8HF "w") (VNx4HF "w") (VNx2HF "w")
- (VNx8BF "w")
+ (VNx8BF "w") (VNx4BF "w") (VNx2BF "w")
(VNx4SI "w") (VNx2SI "w")
(VNx4SF "w") (VNx2SF "w")
(VNx2DI "x")
(VNx2DI "VNx2DI")
(VNx8HF "VNx8HI") (VNx4HF "VNx4SI")
(VNx2HF "VNx2DI")
+ (VNx8BF "VNx8HI") (VNx4BF "VNx4SI")
+ (VNx2BF "VNx2DI")
(VNx4SF "VNx4SI") (VNx2SF "VNx2DI")
(VNx2DF "VNx2DI")])
(VNx2DI "vnx2di")
(VNx8HF "vnx8hi") (VNx4HF "vnx4si")
(VNx2HF "vnx2di")
+ (VNx8BF "vnx8hi") (VNx4BF "vnx4si")
+ (VNx2BF "vnx2di")
(VNx4SF "vnx4si") (VNx2SF "vnx2di")
(VNx2DF "vnx2di")])
(VNx4QI "VNx4BI") (VNx2QI "VNx2BI")
(VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI")
(VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI")
- (VNx8BF "VNx8BI")
+ (VNx8BF "VNx8BI") (VNx4BF "VNx4BI") (VNx2BF "VNx2BI")
(VNx4SI "VNx4BI") (VNx2SI "VNx2BI")
(VNx4SF "VNx4BI") (VNx2SF "VNx2BI")
(VNx2DI "VNx2BI")
(VNx4QI "vnx4bi") (VNx2QI "vnx2bi")
(VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi")
(VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi")
- (VNx8BF "vnx8bi")
+ (VNx8BF "vnx8bi") (VNx4BF "vnx4bi") (VNx2BF "vnx2bi")
(VNx4SI "vnx4bi") (VNx2SI "vnx2bi")
(VNx4SF "vnx4bi") (VNx2SF "vnx2bi")
(VNx2DI "vnx2bi")
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B) B, B
+#define PERM1(B) PERM0 (B), PERM0 (B)
+#define PERM2(B) PERM1 (B), PERM1 (B)
+#define PERM3(B) PERM2 (B), PERM2 (B)
+#define PERM4(B) PERM3 (B), PERM3 (B)
+#define PERM5(B) PERM4 (B), PERM4 (B)
+#define PERM6(B) PERM5 (B), PERM5 (B)
+
+/*
+** qi_dup_h_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** dup (z[0-9]+)\.h, \2\.h\[1\]
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_dup_h_1 (v128qi x)
+{
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
+}
+
+/*
+** qi_dup_h_31:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** dup (z[0-9]+)\.h, \2\.h\[31\]
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_dup_h_31 (v128qi x)
+{
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (31) });
+}
+
+/*
+** qi_dup_s_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** dup (z[0-9]+)\.s, \2\.s\[1\]
+** st1b \3\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_dup_s_1 (v64qi x)
+{
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
+}
+
+/*
+** qi_dup_s_15:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** dup (z[0-9]+)\.s, \2\.s\[15\]
+** st1b \3\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_dup_s_15 (v64qi x)
+{
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (15) });
+}
+
+/*
+** qi_dup_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[1\]
+** st1b \3\.d, \1, \[x8\]
+** ret
+*/
+v32qi
+qi_dup_d_1 (v32qi x)
+{
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
+}
+
+/*
+** qi_dup_d_7:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[7\]
+** st1b \3\.d, \1, \[x8\]
+** ret
+*/
+v32qi
+qi_dup_d_7 (v32qi x)
+{
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (7) });
+}
+
+/*
+** hi_dup_s_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** dup (z[0-9]+)\.s, \2\.s\[1\]
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_dup_s_1 (v64hi x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** hi_dup_s_15:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** dup (z[0-9]+)\.s, \2\.s\[15\]
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_dup_s_15 (v64hi x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (15) });
+}
+
+/*
+** hf_dup_s_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** dup (z[0-9]+)\.s, \2\.s\[1\]
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_dup_s_1 (v64hf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** hf_dup_s_11:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** dup (z[0-9]+)\.s, \2\.s\[11\]
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_dup_s_11 (v64hf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (11) });
+}
+
+/*
+** bf_dup_s_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** dup (z[0-9]+)\.s, \2\.s\[1\]
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_dup_s_1 (v64bf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** bf_dup_s_13:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** dup (z[0-9]+)\.s, \2\.s\[13\]
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_dup_s_13 (v64bf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (13) });
+}
+
+/*
+** hi_dup_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[1\]
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hi
+hi_dup_d_1 (v32hi x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** hi_dup_d_7:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[7\]
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hi
+hi_dup_d_7 (v32hi x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) });
+}
+
+/*
+** hf_dup_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[1\]
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hf
+hf_dup_d_1 (v32hf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** hf_dup_d_5:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[5\]
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hf
+hf_dup_d_5 (v32hf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (5) });
+}
+
+/*
+** bf_dup_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[1\]
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32bf
+bf_dup_d_1 (v32bf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** bf_dup_d_6:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[6\]
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32bf
+bf_dup_d_6 (v32bf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (6) });
+}
+
+/*
+** si_dup_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[1\]
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32si
+si_dup_d_1 (v32si x)
+{
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
+}
+
+/*
+** si_dup_d_7:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[7\]
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32si
+si_dup_d_7 (v32si x)
+{
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
+}
+
+/*
+** sf_dup_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[1\]
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32sf
+sf_dup_d_1 (v32sf x)
+{
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
+}
+
+/*
+** sf_dup_d_7:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** dup (z[0-9]+)\.d, \2\.d\[7\]
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32sf
+sf_dup_d_7 (v32sf x)
+{
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (7) });
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B) B, B
+#define PERM1(B) PERM0 (B), PERM0 (B)
+#define PERM2(B) PERM1 (B), PERM1 (B)
+#define PERM3(B) PERM2 (B), PERM2 (B)
+#define PERM4(B) PERM3 (B), PERM3 (B)
+#define PERM5(B) PERM4 (B), PERM4 (B)
+#define PERM6(B) PERM5 (B), PERM5 (B)
+
+v128qi
+qi_dup_h_32 (v128qi x)
+{
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (32) });
+}
+
+v64qi
+qi_dup_s_16 (v64qi x)
+{
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (16) });
+}
+
+v32qi
+qi_dup_d_8 (v32qi x)
+{
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (8) });
+}
+
+v64hi
+hi_dup_s_16 (v64hi x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
+}
+
+v64hf
+hf_dup_s_16 (v64hf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
+}
+
+v64bf
+bf_dup_s_16 (v64bf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) });
+}
+
+v32hi
+hi_dup_d_8 (v32hi x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
+}
+
+v32hf
+hf_dup_d_8 (v32hf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
+}
+
+v32bf
+bf_dup_d_8 (v32bf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) });
+}
+
+v32si
+si_dup_d_8 (v32si x)
+{
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (8) });
+}
+
+v32sf
+sf_dup_d_8 (v32sf x)
+{
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (8) });
+}
+
+/* Each function above selects the first lane whose byte offset within
+ its container layout reaches 64, so none of these shuffles should be
+ implemented with an indexed DUP. */
+/* { dg-final { scan-assembler-not {\tdup\tz} } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B) B, B + 1
+#define PERM1(B) PERM0 (B), PERM0 (B + 2)
+#define PERM2(B) PERM1 (B), PERM1 (B + 4)
+#define PERM3(B) PERM2 (B), PERM2 (B + 8)
+#define PERM4(B) PERM3 (B), PERM3 (B + 16)
+#define PERM5(B) PERM4 (B), PERM4 (B + 32)
+#define PERM6(B) PERM5 (B), PERM5 (B + 64)
+
+/*
+** qi_ext_h_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #2
+** st1b \2\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_ext_h_1 (v128qi x)
+{
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
+}
+
+/*
+** qi_ext_h_1_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** ext \3\.b, \3\.b, \2\.b, #2
+** st1b \3\.h, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** ext \4\.b, \4\.b, \5\.b, #2
+** st1b \4\.h, \1, \[x8\]
+** )
+** ret
+*/
+v128qi
+qi_ext_h_1_two_op (v128qi x, v128qi y)
+{
+ return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) });
+}
+
+/*
+** qi_ext_h_127:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #254
+** st1b \2\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_ext_h_127 (v128qi x)
+{
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) });
+}
+
+/*
+** qi_ext_s_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #4
+** st1b \2\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_ext_s_1 (v64qi x)
+{
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
+}
+
+/*
+** qi_ext_s_63:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #252
+** st1b \2\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_ext_s_63 (v64qi x)
+{
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) });
+}
+
+/*
+** qi_ext_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #8
+** st1b \2\.d, \1, \[x8\]
+** ret
+*/
+v32qi
+qi_ext_d_1 (v32qi x)
+{
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
+}
+
+/*
+** qi_ext_d_31:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #248
+** st1b \2\.d, \1, \[x8\]
+** ret
+*/
+v32qi
+qi_ext_d_31 (v32qi x)
+{
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) });
+}
+
+/*
+** hi_ext_s_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #4
+** st1h \2\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_ext_s_1 (v64hi x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** hi_ext_s_63:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #252
+** st1h \2\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_ext_s_63 (v64hi x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
+}
+
+/*
+** hf_ext_s_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #4
+** st1h \2\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_ext_s_1 (v64hf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** hf_ext_s_60:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #240
+** st1h \2\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_ext_s_60 (v64hf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (60) });
+}
+
+/*
+** bf_ext_s_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #4
+** st1h \2\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_ext_s_1 (v64bf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** bf_ext_s_40:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #160
+** st1h \2\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_ext_s_40 (v64bf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (40) });
+}
+
+/*
+** hi_ext_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #8
+** st1h \2\.d, \1, \[x8\]
+** ret
+*/
+v32hi
+hi_ext_d_1 (v32hi x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** hi_ext_d_31:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #248
+** st1h \2\.d, \1, \[x8\]
+** ret
+*/
+v32hi
+hi_ext_d_31 (v32hi x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
+}
+
+/*
+** hf_ext_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #8
+** st1h \2\.d, \1, \[x8\]
+** ret
+*/
+v32hf
+hf_ext_d_1 (v32hf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** hf_ext_d_18:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #144
+** st1h \2\.d, \1, \[x8\]
+** ret
+*/
+v32hf
+hf_ext_d_18 (v32hf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (18) });
+}
+
+/*
+** bf_ext_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #8
+** st1h \2\.d, \1, \[x8\]
+** ret
+*/
+v32bf
+bf_ext_d_1 (v32bf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** bf_ext_d_7:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #56
+** st1h \2\.d, \1, \[x8\]
+** ret
+*/
+v32bf
+bf_ext_d_7 (v32bf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) });
+}
+
+/*
+** si_ext_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #8
+** st1w \2\.d, \1, \[x8\]
+** ret
+*/
+v32si
+si_ext_d_1 (v32si x)
+{
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
+}
+
+/*
+** si_ext_d_31:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #248
+** st1w \2\.d, \1, \[x8\]
+** ret
+*/
+v32si
+si_ext_d_31 (v32si x)
+{
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
+}
+
+/*
+** sf_ext_d_1:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #8
+** st1w \2\.d, \1, \[x8\]
+** ret
+*/
+v32sf
+sf_ext_d_1 (v32sf x)
+{
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
+}
+
+/*
+** sf_ext_d_31:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** ext \2\.b, \2\.b, \2\.b, #248
+** st1w \2\.d, \1, \[x8\]
+** ret
+*/
+v32sf
+sf_ext_d_31 (v32sf x)
+{
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B) B, B - 1
+#define PERM1(B) PERM0 (B), PERM0 (B - 2)
+#define PERM2(B) PERM1 (B), PERM1 (B - 4)
+#define PERM3(B) PERM2 (B), PERM2 (B - 8)
+#define PERM4(B) PERM3 (B), PERM3 (B - 16)
+#define PERM5(B) PERM4 (B), PERM4 (B - 32)
+#define PERM6(B) PERM5 (B), PERM5 (B - 64)
+
+/*
+** qi_rev_h:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** rev (z[0-9]+)\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_rev_h (v128qi x)
+{
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) });
+}
+
+/*
+** qi_rev_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** rev (z[0-9]+)\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_rev_s (v64qi x)
+{
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) });
+}
+
+/*
+** qi_rev_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** rev (z[0-9]+)\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** ret
+*/
+v32qi
+qi_rev_d (v32qi x)
+{
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) });
+}
+
+/*
+** hi_rev_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** rev (z[0-9]+)\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_rev_s (v64hi x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
+}
+
+/*
+** hf_rev_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** rev (z[0-9]+)\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_rev_s (v64hf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
+}
+
+/*
+** bf_rev_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** rev (z[0-9]+)\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_rev_s (v64bf x)
+{
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) });
+}
+
+/*
+** hi_rev_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** rev (z[0-9]+)\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hi
+hi_rev_d (v32hi x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
+}
+
+/*
+** hf_rev_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** rev (z[0-9]+)\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hf
+hf_rev_d (v32hf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
+}
+
+/*
+** bf_rev_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** rev (z[0-9]+)\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32bf
+bf_rev_d (v32bf x)
+{
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) });
+}
+
+/*
+** si_rev_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** rev (z[0-9]+)\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32si
+si_rev_d (v32si x)
+{ /* v32si data: the pattern above expects LD1W/REV/ST1W on .d lanes.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
+}
+
+/*
+** sf_rev_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** rev (z[0-9]+)\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32sf
+sf_rev_d (v32sf x)
+{ /* v32sf data shuffled with a v32si selector; expects REV on .d lanes.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (31) });
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+
+#define PERM0(B) B + 1, B /* One swapped pair of indices.  */
+#define PERM1(B) PERM0 (B), PERM0 (B + 2)
+#define PERM2(B) PERM1 (B), PERM1 (B + 4)
+#define PERM3(B) PERM2 (B), PERM2 (B + 8)
+#define PERM4(B) PERM3 (B), PERM3 (B + 16)
+#define PERM5(B) PERM4 (B), PERM4 (B + 32)
+#define PERM6(B) PERM5 (B), PERM5 (B + 64) /* PERM6 (0): 1, 0, ..., 127, 126.  */
+
+/*
+** qi_revh_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** revh (z[0-9]+)\.s, \1/m, \2\.s
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_revh_s (v128qi x)
+{ /* Swap each adjacent pair of bytes; the pattern above expects REVH .s.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+/*
+** qi_revw_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** revw (z[0-9]+)\.d, \1/m, \2\.d
+** st1b \3\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_revw_d (v64qi x)
+{ /* Swap each adjacent pair of bytes; the pattern above expects REVW .d.  */
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
+}
+
+/*
+** hi_revw_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** revw (z[0-9]+)\.d, \1/m, \2\.d
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_revw_d (v64hi x)
+{ /* Swap each adjacent pair of halfwords; expects REVW on .d lanes.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+/*
+** hf_revw_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** revw (z[0-9]+)\.d, \1/m, \2\.d
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_revw_d (v64hf x)
+{ /* _Float16 data, v64hi selector: swap adjacent pairs; expects REVW .d.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+/*
+** bf_revw_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** revw (z[0-9]+)\.d, \1/m, \2\.d
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_revw_d (v64bf x)
+{ /* __bf16 data, v64hi selector: swap adjacent pairs; expects REVW .d.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+#undef PERM1
+#define PERM1(B) PERM0 (B + 2), PERM0 (B) /* Now B + 3, B + 2, B + 1, B.  */
+
+/*
+** qi_revh_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** revh (z[0-9]+)\.d, \1/m, \2\.d
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_revh_d (v128qi x)
+{ /* With PERM1 redefined just above, this reverses each group of four
+     bytes; the pattern above expects REVH on .d lanes.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+v64qi
+qi_revw_q (v64qi x)
+{ /* No expected body.  The scan-assembler-times count of 6 at the end of
+     the file equals the number of functions with REVH/REVW patterns, so
+     this must not produce another match for the rev. regex.  */
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
+}
+
+v64hi
+hi_revw_q (v64hi x)
+{ /* No expected body.  The scan-assembler-times count of 6 at the end of
+     the file equals the number of functions with REVH/REVW patterns, so
+     this must not produce another match for the rev. regex.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+#undef PERM2
+/* Reverse each group of eight indices: B + 7 down to B.  Build on the
+   already-reversed PERM1 (not PERM0) so that PERM2 still yields eight
+   indices and PERM6 still initializes all 128 selector elements.  */
+#define PERM2(B) PERM1 (B + 4), PERM1 (B)
+
+v128qi
+qi_revh_q (v128qi x)
+{ /* No expected body.  The scan-assembler-times count of 6 at the end of
+     the file equals the number of functions with REVH/REVW patterns, so
+     this must not produce another match for the rev. regex.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mbig-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+
+#define PERM0(B) B + 1, B /* One swapped pair of indices.  */
+#define PERM1(B) PERM0 (B), PERM0 (B + 2)
+#define PERM2(B) PERM1 (B), PERM1 (B + 4)
+#define PERM3(B) PERM2 (B), PERM2 (B + 8)
+#define PERM4(B) PERM3 (B), PERM3 (B + 16)
+#define PERM5(B) PERM4 (B), PERM4 (B + 32)
+#define PERM6(B) PERM5 (B), PERM5 (B + 64) /* PERM6 (0): 1, 0, ..., 127, 126.  */
+
+/*
+** qi_revh_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** revh (z[0-9]+)\.s, \1/m, \2\.s
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_revh_s (v128qi x)
+{ /* Swap each adjacent pair of bytes; the pattern above expects REVH .s.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+/*
+** qi_revw_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** revw (z[0-9]+)\.d, \1/m, \2\.d
+** st1b \3\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_revw_d (v64qi x)
+{ /* Swap each adjacent pair of bytes; the pattern above expects REVW .d.  */
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
+}
+
+/*
+** hi_revw_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** revw (z[0-9]+)\.d, \1/m, \2\.d
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_revw_d (v64hi x)
+{ /* Swap each adjacent pair of halfwords; expects REVW on .d lanes.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+/*
+** hf_revw_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** revw (z[0-9]+)\.d, \1/m, \2\.d
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_revw_d (v64hf x)
+{ /* _Float16 data, v64hi selector: swap adjacent pairs; expects REVW .d.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+/*
+** bf_revw_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** revw (z[0-9]+)\.d, \1/m, \2\.d
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_revw_d (v64bf x)
+{ /* __bf16 data, v64hi selector: swap adjacent pairs; expects REVW .d.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+#undef PERM1
+#define PERM1(B) PERM0 (B + 2), PERM0 (B) /* Now B + 3, B + 2, B + 1, B.  */
+
+/*
+** qi_revh_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** revh (z[0-9]+)\.d, \1/m, \2\.d
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_revh_d (v128qi x)
+{ /* With PERM1 redefined just above, this reverses each group of four
+     bytes; the pattern above expects REVH on .d lanes.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+v64qi
+qi_revw_q (v64qi x)
+{ /* No expected body.  The scan-assembler-times count of 6 at the end of
+     the file equals the number of functions with REVH/REVW patterns, so
+     this must not produce another match for the rev. regex.  */
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
+}
+
+v64hi
+hi_revw_q (v64hi x)
+{ /* No expected body.  The scan-assembler-times count of 6 at the end of
+     the file equals the number of functions with REVH/REVW patterns, so
+     this must not produce another match for the rev. regex.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+#undef PERM2
+/* Reverse each group of eight indices: B + 7 down to B.  Build on the
+   already-reversed PERM1 (not PERM0) so that PERM2 still yields eight
+   indices and PERM6 still initializes all 128 selector elements.  */
+#define PERM2(B) PERM1 (B + 4), PERM1 (B)
+
+v128qi
+qi_revh_q (v128qi x)
+{ /* No expected body.  The scan-assembler-times count of 6 at the end of
+     the file equals the number of functions with REVH/REVW patterns, so
+     this must not produce another match for the rev. regex.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+void
+f (short *restrict s, signed char *restrict c)
+{ /* For i = 0, 2, 4, 6: store c[i], converted to short, into both s[i]
+     and s[i + 1].  Odd-indexed elements of c are never read.  */
+ for (int i = 0; i < 8; i += 2)
+ {
+ s[i] = c[i];
+ s[i + 1] = c[i];
+ }
+}
+
+/* Ideally this would use LD1SB, but currently we use LD1B and
+ sign-extend it after the permute. */
+/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl6\n} } } */
+/* { dg-final { scan-assembler {\tld1s?b\tz[0-9]+\.h} } } */
+/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.h,} } } */
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B, C) B, B + C /* Index B, then the index C beyond it.  */
+#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C)
+#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C)
+#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C)
+#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C) /* 32 indices.  */
+#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C) /* 64 indices.  */
+#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C) /* 128 indices.  */
+
+/*
+** qi_trn1_h_a:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_trn1_h_a (v128qi x)
+{ /* Selector 0, 0, 2, 2, ...: every even-indexed byte twice.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) });
+}
+
+/*
+** qi_trn1_h_b:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_trn1_h_b (v128qi x)
+{ /* Selector 0, 128, 2, 130, ...; an index of 128 + i names element i
+     of the second operand, which is x again here.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) });
+}
+
+/*
+** qi_trn1_h_c:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_trn1_h_c (v128qi x)
+{ /* Selector 128, 128, 130, 130, ...: every index names the second x.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) });
+}
+
+/*
+** qi_trn1_h_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** trn1 \3\.h, \3\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** trn1 \4\.h, \4\.h, \5\.h
+** st1b \4\.h, \1, \[x8\]
+** )
+** ret
+*/
+v128qi
+qi_trn1_h_two_op (v128qi x, v128qi y)
+{ /* Selector 0, 128, 2, 130, ...: x[0], y[0], x[2], y[2], ...  */
+ return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) });
+}
+
+/*
+** qi_trn1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_trn1_s (v64qi x)
+{ /* Selector 0, 64, 2, 66, ... on 64 bytes; expects TRN1 on .s lanes.  */
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) });
+}
+
+/*
+** qi_trn1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** trn1 \3\.s, \3\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** trn1 \4\.s, \4\.s, \5\.s
+** st1b \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64qi
+qi_trn1_s_two_op (v64qi x, v64qi y)
+{ /* Selector 0, 64, 2, 66, ...: x[0], y[0], x[2], y[2], ...  */
+ return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) });
+}
+
+/*
+** qi_trn1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** ret
+*/
+v32qi
+qi_trn1_d (v32qi x)
+{ /* Selector 0, 32, 2, 34, ... on 32 bytes; expects TRN1 on .d lanes.  */
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) });
+}
+
+/*
+** qi_trn1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** trn1 \3\.d, \3\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** trn1 \4\.d, \4\.d, \5\.d
+** st1b \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32qi
+qi_trn1_d_two_op (v32qi x, v32qi y)
+{ /* Selector 0, 32, 2, 34, ...: x[0], y[0], x[2], y[2], ...  */
+ return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) });
+}
+
+/*
+** hi_trn1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_trn1_s (v64hi x)
+{ /* Selector 0, 64, 2, 66, ... on 64 halfwords; expects TRN1 .s.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hi_trn1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn1 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** trn1 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hi
+hi_trn1_s_two_op (v64hi x, v64hi y)
+{ /* Selector 0, 64, 2, 66, ...: x[0], y[0], x[2], y[2], ...  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hf_trn1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_trn1_s (v64hf x)
+{ /* _Float16 data, v64hi selector 0, 64, 2, 66, ...; expects TRN1 .s.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hf_trn1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn1 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** trn1 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hf
+hf_trn1_s_two_op (v64hf x, v64hf y)
+{ /* _Float16 data, v64hi selector: x[0], y[0], x[2], y[2], ...  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** bf_trn1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_trn1_s (v64bf x)
+{ /* __bf16 data, v64hi selector 0, 64, 2, 66, ...; expects TRN1 .s.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** bf_trn1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn1 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** trn1 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64bf
+bf_trn1_s_two_op (v64bf x, v64bf y)
+{ /* __bf16 data, v64hi selector: x[0], y[0], x[2], y[2], ...  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hi_trn1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hi
+hi_trn1_d (v32hi x)
+{ /* Selector 0, 32, 2, 34, ... on 32 halfwords; expects TRN1 .d.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hi_trn1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn1 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** trn1 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hi
+hi_trn1_d_two_op (v32hi x, v32hi y)
+{ /* Selector 0, 32, 2, 34, ...: x[0], y[0], x[2], y[2], ...  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hf_trn1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hf
+hf_trn1_d (v32hf x)
+{ /* _Float16 data, v32hi selector 0, 32, 2, 34, ...; expects TRN1 .d.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hf_trn1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn1 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** trn1 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hf
+hf_trn1_d_two_op (v32hf x, v32hf y)
+{ /* _Float16 data, v32hi selector: x[0], y[0], x[2], y[2], ...  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** bf_trn1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32bf
+bf_trn1_d (v32bf x)
+{ /* __bf16 data, v32hi selector 0, 32, 2, 34, ...; expects TRN1 .d.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** bf_trn1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn1 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** trn1 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32bf
+bf_trn1_d_two_op (v32bf x, v32bf y)
+{ /* __bf16 data, v32hi selector: x[0], y[0], x[2], y[2], ...  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** si_trn1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32si
+si_trn1_d (v32si x)
+{ /* Selector 0, 32, 2, 34, ... on 32 words; expects TRN1 on .d lanes.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
+}
+
+/*
+** sf_trn1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** trn1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32sf
+sf_trn1_d (v32sf x)
+{ /* float data, v32si selector 0, 32, 2, 34, ...; expects TRN1 .d.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B, C) B, B + C /* Index B, then the index C beyond it.  */
+#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C)
+#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C)
+#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C)
+#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C) /* 32 indices.  */
+#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C) /* 64 indices.  */
+#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C) /* 128 indices.  */
+
+/*
+** qi_trn2_h_a:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_trn2_h_a (v128qi x)
+{ /* Selector 1, 1, 3, 3, ...: every odd-indexed byte twice.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 0) });
+}
+
+/*
+** qi_trn2_h_b:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_trn2_h_b (v128qi x)
+{ /* Selector 1, 129, 3, 131, ...; an index of 128 + i names element i
+     of the second operand, which is x again here.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 128) });
+}
+
+/*
+** qi_trn2_h_c:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_trn2_h_c (v128qi x)
+{
+ /* Take every index from the second operand: 129, 129, 131, 131, ...
+    PERM6 (1, 0) would merely repeat qi_trn2_h_a; this mirrors the
+    PERM6 (128, 0) selector of qi_trn1_h_c in the TRN1 test.  The
+    largest index is 255, so it still fits an unsigned char element.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (129, 0) });
+}
+
+/*
+** qi_trn2_h_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** trn2 \3\.h, \3\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** trn2 \4\.h, \4\.h, \5\.h
+** st1b \4\.h, \1, \[x8\]
+** )
+** ret
+*/
+v128qi
+qi_trn2_h_two_op (v128qi x, v128qi y)
+{ /* Selector 1, 129, 3, 131, ...: x[1], y[1], x[3], y[3], ...  */
+ return __builtin_shuffle (x, y, (v128qi) { PERM6 (1, 128) });
+}
+
+/*
+** qi_trn2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_trn2_s (v64qi x)
+{ /* Selector 1, 65, 3, 67, ... on 64 bytes; expects TRN2 on .s lanes.  */
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (1, 64) });
+}
+
+/*
+** qi_trn2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** trn2 \3\.s, \3\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** trn2 \4\.s, \4\.s, \5\.s
+** st1b \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64qi
+qi_trn2_s_two_op (v64qi x, v64qi y)
+{ /* Selector 1, 65, 3, 67, ...: x[1], y[1], x[3], y[3], ...  */
+ return __builtin_shuffle (x, y, (v64qi) { PERM5 (1, 64) });
+}
+
+/*
+** qi_trn2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** ret
+*/
+v32qi
+qi_trn2_d (v32qi x)
+{ /* Selector 1, 33, 3, 35, ... on 32 bytes; expects TRN2 on .d lanes.  */
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (1, 32) });
+}
+
+/*
+** qi_trn2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** trn2 \3\.d, \3\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** trn2 \4\.d, \4\.d, \5\.d
+** st1b \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32qi
+qi_trn2_d_two_op (v32qi x, v32qi y)
+{ /* Selector 1, 33, 3, 35, ...: x[1], y[1], x[3], y[3], ...  */
+ return __builtin_shuffle (x, y, (v32qi) { PERM4 (1, 32) });
+}
+
+/*
+** hi_trn2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_trn2_s (v64hi x)
+{ /* Selector 1, 65, 3, 67, ... on 64 halfwords; expects TRN2 .s.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) });
+}
+
+/*
+** hi_trn2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn2 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** trn2 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hi
+hi_trn2_s_two_op (v64hi x, v64hi y)
+{ /* Selector 1, 65, 3, 67, ...: x[1], y[1], x[3], y[3], ...  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) });
+}
+
+/*
+** hf_trn2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_trn2_s (v64hf x)
+{ /* _Float16 data, v64hi selector 1, 65, 3, 67, ...; expects TRN2 .s.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) });
+}
+
+/*
+** hf_trn2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn2 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** trn2 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hf
+hf_trn2_s_two_op (v64hf x, v64hf y)
+{ /* _Float16 data, v64hi selector: x[1], y[1], x[3], y[3], ...  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) });
+}
+
+/*
+** bf_trn2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_trn2_s (v64bf x)
+{ /* __bf16 data, v64hi selector 1, 65, 3, 67, ...; expects TRN2 .s.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) });
+}
+
+/*
+** bf_trn2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** trn2 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** trn2 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64bf
+bf_trn2_s_two_op (v64bf x, v64bf y)
+{ /* __bf16 data, v64hi selector: x[1], y[1], x[3], y[3], ...  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) });
+}
+
+/*
+** hi_trn2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hi
+hi_trn2_d (v32hi x)
+{ /* Selector 1, 33, 3, 35, ... on 32 halfwords; expects TRN2 .d.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) });
+}
+
+/*
+** hi_trn2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn2 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** trn2 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hi
+hi_trn2_d_two_op (v32hi x, v32hi y)
+{ /* Selector 1, 33, 3, 35, ...: x[1], y[1], x[3], y[3], ...  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) });
+}
+
+/*
+** hf_trn2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hf
+hf_trn2_d (v32hf x)
+{ /* _Float16 data, v32hi selector 1, 33, 3, 35, ...; expects TRN2 .d.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) });
+}
+
+/*
+** hf_trn2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn2 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** trn2 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hf
+hf_trn2_d_two_op (v32hf x, v32hf y)
+{ /* _Float16 data, v32hi selector: x[1], y[1], x[3], y[3], ...  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) });
+}
+
+/*
+** bf_trn2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32bf
+bf_trn2_d (v32bf x)
+{ /* __bf16 data, v32hi selector 1, 33, 3, 35, ...; expects TRN2 .d.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) });
+}
+
+/*
+** bf_trn2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** trn2 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** trn2 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32bf
+bf_trn2_d_two_op (v32bf x, v32bf y)
+{ /* __bf16 data, v32hi selector: x[1], y[1], x[3], y[3], ...  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) });
+}
+
+/*
+** si_trn2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32si
+si_trn2_d (v32si x)
+{ /* Selector 1, 33, 3, 35, ... on 32 words; expects TRN2 on .d lanes.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) });
+}
+
+/*
+** sf_trn2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** trn2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32sf
+sf_trn2_d (v32sf x)
+{ /* float data, v32si selector 1, 33, 3, 35, ...; expects TRN2 .d.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) });
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128)));
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B) B, B + 2 /* Two indices, stepping by two.  */
+#define PERM1(B) PERM0 (B), PERM0 (B + 4)
+#define PERM2(B) PERM1 (B), PERM1 (B + 8)
+#define PERM3(B) PERM2 (B), PERM2 (B + 16)
+#define PERM4(B) PERM3 (B), PERM3 (B + 32) /* B, B + 2, ..., B + 62.  */
+#define PERM5(B) PERM4 (B), PERM4 (B + 64) /* B, B + 2, ..., B + 126.  */
+#define PERM6(B) PERM5 (B), PERM5 (B + 128) /* B, B + 2, ..., B + 254.  */
+
+/*
+** qi_uzp1_h:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** uzp1 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_uzp1_h (v128qi x)
+{ /* Selector 0, 2, ..., 254: the even-indexed elements of x:x.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) });
+}
+
+/*
+** qi_uzp1_h_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** uzp1 \3\.h, \3\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** uzp1 \4\.h, \4\.h, \5\.h
+** st1b \4\.h, \1, \[x8\]
+** )
+** ret
+*/
+v128qi
+qi_uzp1_h_two_op (v128qi x, v128qi y)
+{ /* Selector 0, 2, ..., 254: the even-indexed elements of x:y.  */
+ return __builtin_shuffle (x, y, (v128qi) { PERM6 (0) });
+}
+
+/*
+** qi_uzp1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_uzp1_s (v64qi x)
+{ /* Selector 0, 2, ..., 126 on 64 bytes; expects UZP1 on .s lanes.  */
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) });
+}
+
+/*
+** qi_uzp1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp1 \3\.s, \3\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** uzp1 \4\.s, \4\.s, \5\.s
+** st1b \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64qi
+qi_uzp1_s_two_op (v64qi x, v64qi y)
+{ /* Selector 0, 2, ..., 126: the even-indexed elements of x:y.  */
+ return __builtin_shuffle (x, y, (v64qi) { PERM5 (0) });
+}
+
+/*
+** qi_uzp1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** ret
+*/
+v32qi
+qi_uzp1_d (v32qi x)
+{ /* Selector 0, 2, ..., 62 on 32 bytes; expects UZP1 on .d lanes.  */
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (0) });
+}
+
+/*
+** qi_uzp1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp1 \3\.d, \3\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** uzp1 \4\.d, \4\.d, \5\.d
+** st1b \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32qi
+qi_uzp1_d_two_op (v32qi x, v32qi y)
+{ /* Selector 0, 2, ..., 62: the even-indexed elements of x:y.  */
+ return __builtin_shuffle (x, y, (v32qi) { PERM4 (0) });
+}
+
+/*
+** hi_uzp1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_uzp1_s (v64hi x)
+{ /* Selector 0, 2, ..., 126 on 64 halfwords; expects UZP1 .s.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+/*
+** hi_uzp1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp1 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** uzp1 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hi
+hi_uzp1_s_two_op (v64hi x, v64hi y)
+{ /* Selector 0, 2, ..., 126: the even-indexed elements of x:y.  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
+}
+
+/*
+** hf_uzp1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_uzp1_s (v64hf x)
+{ /* _Float16 data, v64hi selector 0, 2, ..., 126; expects UZP1 .s.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+/*
+** hf_uzp1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp1 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** uzp1 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hf
+hf_uzp1_s_two_op (v64hf x, v64hf y)
+{ /* _Float16 data, v64hi selector: even-indexed elements of x:y.  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
+}
+
+/*
+** bf_uzp1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_uzp1_s (v64bf x)
+{ /* __bf16 data, v64hi selector 0, 2, ..., 126; expects UZP1 .s.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) });
+}
+
+/*
+** bf_uzp1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp1 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** uzp1 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64bf
+bf_uzp1_s_two_op (v64bf x, v64bf y)
+{ /* __bf16 data, v64hi selector: even-indexed elements of x:y.  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) });
+}
+
+/*
+** hi_uzp1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hi
+hi_uzp1_d (v32hi x)
+{ /* Selector 0, 2, ..., 62 on 32 halfwords; expects UZP1 .d.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
+}
+
+/*
+** hi_uzp1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp1 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** uzp1 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hi
+hi_uzp1_d_two_op (v32hi x, v32hi y)
+{ /* Selector 0, 2, ..., 62: the even-indexed elements of x:y.  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
+}
+
+/*
+** hf_uzp1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hf
+hf_uzp1_d (v32hf x)
+{ /* _Float16 data, v32hi selector 0, 2, ..., 62; expects UZP1 .d.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
+}
+
+/*
+** hf_uzp1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp1 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** uzp1 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hf
+hf_uzp1_d_two_op (v32hf x, v32hf y)
+{ /* _Float16 data, v32hi selector: even-indexed elements of x:y.  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
+}
+
+/*
+** bf_uzp1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32bf
+bf_uzp1_d (v32bf x)
+{ /* __bf16 data, v32hi selector 0, 2, ..., 62; expects UZP1 .d.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) });
+}
+
+/*
+** bf_uzp1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp1 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** uzp1 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32bf
+bf_uzp1_d_two_op (v32bf x, v32bf y)
+{ /* __bf16 data, v32hi selector: even-indexed elements of x:y.  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) });
+}
+
+/*
+** si_uzp1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32si
+si_uzp1_d (v32si x)
+{ /* Selector 0, 2, ..., 62 on 32 words; expects UZP1 on .d lanes.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (0) });
+}
+
+/*
+** sf_uzp1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32sf
+sf_uzp1_d (v32sf x)
+{ /* float data, v32si selector 0, 2, ..., 62; expects UZP1 .d.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (0) });
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128))); /* Fixed-size
+   GNU vector types named after their element count and element type.
+   At 128, 64 and 32 bytes they are all narrower than the 256-byte
+   vectors selected by -msve-vector-bits=2048 in the test options.  */
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B) B, B + 2 /* PERMn (B) expands to 2^(n+1) indices: B, B + 2, B + 4, ...  */
+#define PERM1(B) PERM0 (B), PERM0 (B + 4)
+#define PERM2(B) PERM1 (B), PERM1 (B + 8)
+#define PERM3(B) PERM2 (B), PERM2 (B + 16)
+#define PERM4(B) PERM3 (B), PERM3 (B + 32) /* 32 indices, B ... B + 62.  */
+#define PERM5(B) PERM4 (B), PERM4 (B + 64) /* 64 indices, B ... B + 126.  */
+#define PERM6(B) PERM5 (B), PERM5 (B + 128) /* 128 indices, B ... B + 254.  */
+
+/*
+** qi_uzp2_h:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** uzp2 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_uzp2_h (v128qi x)
+{ /* PERM6 (1) is 1, 3, ..., 255: the odd-numbered bytes of X paired
+     with itself.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) });
+}
+
+/*
+** qi_uzp2_h_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** uzp2 \3\.h, \3\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** uzp2 \4\.h, \4\.h, \5\.h
+** st1b \4\.h, \1, \[x8\]
+** )
+** ret
+*/
+v128qi
+qi_uzp2_h_two_op (v128qi x, v128qi y)
+{ /* PERM6 (1) is 1, 3, ..., 255: the odd-numbered bytes of the
+     256-element pair (X, Y).  */
+ return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) });
+}
+
+/*
+** qi_uzp2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_uzp2_s (v64qi x)
+{ /* PERM5 (1) is 1, 3, ..., 127: the odd-numbered bytes of X paired
+     with itself.  */
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) });
+}
+
+/*
+** qi_uzp2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp2 \3\.s, \3\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** uzp2 \4\.s, \4\.s, \5\.s
+** st1b \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64qi
+qi_uzp2_s_two_op (v64qi x, v64qi y)
+{ /* PERM5 (1) is 1, 3, ..., 127: the odd-numbered bytes of the
+     128-element pair (X, Y).  */
+ return __builtin_shuffle (x, y, (v64qi) { PERM5 (1) });
+}
+
+/*
+** qi_uzp2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** ret
+*/
+v32qi
+qi_uzp2_d (v32qi x)
+{ /* PERM4 (1) is 1, 3, ..., 63: the odd-numbered bytes of X paired
+     with itself.  */
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) });
+}
+
+/*
+** qi_uzp2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp2 \3\.d, \3\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** uzp2 \4\.d, \4\.d, \5\.d
+** st1b \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32qi
+qi_uzp2_d_two_op (v32qi x, v32qi y)
+{ /* PERM4 (1) is 1, 3, ..., 63: the odd-numbered bytes of the
+     64-element pair (X, Y).  */
+ return __builtin_shuffle (x, y, (v32qi) { PERM4 (1) });
+}
+
+/*
+** hi_uzp2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_uzp2_s (v64hi x)
+{ /* PERM5 (1) is 1, 3, ..., 127: the odd-numbered halfwords of X
+     paired with itself.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** hi_uzp2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp2 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** uzp2 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hi
+hi_uzp2_s_two_op (v64hi x, v64hi y)
+{ /* PERM5 (1) is 1, 3, ..., 127: the odd-numbered halfwords of the
+     128-element pair (X, Y).  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) });
+}
+
+/*
+** hf_uzp2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_uzp2_s (v64hf x)
+{ /* PERM5 (1) is 1, 3, ..., 127: the odd-numbered elements of X paired
+     with itself.  The selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** hf_uzp2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp2 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** uzp2 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hf
+hf_uzp2_s_two_op (v64hf x, v64hf y)
+{ /* PERM5 (1) is 1, 3, ..., 127: the odd-numbered elements of the pair
+     (X, Y).  The selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) });
+}
+
+/*
+** bf_uzp2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_uzp2_s (v64bf x)
+{ /* PERM5 (1) is 1, 3, ..., 127: the odd-numbered elements of X paired
+     with itself.  The selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) });
+}
+
+/*
+** bf_uzp2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** uzp2 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** uzp2 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64bf
+bf_uzp2_s_two_op (v64bf x, v64bf y)
+{ /* PERM5 (1) is 1, 3, ..., 127: the odd-numbered elements of the pair
+     (X, Y).  The selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) });
+}
+
+/*
+** hi_uzp2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hi
+hi_uzp2_d (v32hi x)
+{ /* PERM4 (1) is 1, 3, ..., 63: the odd-numbered halfwords of X paired
+     with itself.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** hi_uzp2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp2 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** uzp2 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hi
+hi_uzp2_d_two_op (v32hi x, v32hi y)
+{ /* PERM4 (1) is 1, 3, ..., 63: the odd-numbered halfwords of the
+     64-element pair (X, Y).  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) });
+}
+
+/*
+** hf_uzp2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hf
+hf_uzp2_d (v32hf x)
+{ /* PERM4 (1) is 1, 3, ..., 63: the odd-numbered elements of X paired
+     with itself.  The selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** hf_uzp2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp2 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** uzp2 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hf
+hf_uzp2_d_two_op (v32hf x, v32hf y)
+{ /* PERM4 (1) is 1, 3, ..., 63: the odd-numbered elements of the pair
+     (X, Y).  The selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) });
+}
+
+/*
+** bf_uzp2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32bf
+bf_uzp2_d (v32bf x)
+{ /* PERM4 (1) is 1, 3, ..., 63: the odd-numbered elements of X paired
+     with itself.  The selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) });
+}
+
+/*
+** bf_uzp2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp2 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** uzp2 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32bf
+bf_uzp2_d_two_op (v32bf x, v32bf y)
+{ /* PERM4 (1) is 1, 3, ..., 63: the odd-numbered elements of the pair
+     (X, Y).  The selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) });
+}
+
+/*
+** si_uzp2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32si
+si_uzp2_d (v32si x)
+{ /* PERM4 (1) is 1, 3, ..., 63: the odd-numbered words of X paired
+     with itself.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
+}
+
+/*
+** sf_uzp2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32sf
+sf_uzp2_d (v32sf x)
+{ /* PERM4 (1) is 1, 3, ..., 63: the odd-numbered elements of X paired
+     with itself.  The selector is the integer type v32si.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (1) });
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128))); /* Fixed-size
+   GNU vector types named after their element count and element type.
+   At 128, 64 and 32 bytes they are all narrower than the 256-byte
+   vectors selected by -msve-vector-bits=2048 in the test options.  */
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B, C) B, B + C /* PERMn (B, C) expands to 2^n pairs I, I + C for I = B, B + 1, ...  */
+#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C)
+#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C)
+#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C)
+#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C) /* 16 pairs, 32 indices.  */
+#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C) /* 32 pairs, 64 indices.  */
+#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C) /* 64 pairs, 128 indices.  */
+
+/*
+** qi_zip1_h_a:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_zip1_h_a (v128qi x)
+{ /* PERM6 (0, 0) is 0, 0, 1, 1, ..., 63, 63: each low-half byte of the
+     first shuffle input, twice.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) });
+}
+
+/*
+** qi_zip1_h_b:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_zip1_h_b (v128qi x)
+{ /* PERM6 (0, 128) is 0, 128, 1, 129, ..., 63, 191: low-half bytes
+     taken alternately from the first and second shuffle inputs, both of
+     which are X.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) });
+}
+
+/*
+** qi_zip1_h_c:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_zip1_h_c (v128qi x)
+{ /* PERM6 (128, 0) is 128, 128, 129, 129, ..., 191, 191: each low-half
+     byte of the second shuffle input (also X), twice.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) });
+}
+
+/*
+** qi_zip1_h_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** zip1 \3\.h, \3\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** zip1 \4\.h, \4\.h, \5\.h
+** st1b \4\.h, \1, \[x8\]
+** )
+** ret
+*/
+v128qi
+qi_zip1_h_two_op (v128qi x, v128qi y)
+{ /* PERM6 (0, 128) is 0, 128, 1, 129, ..., 63, 191: interleaves the low
+     halves of X and Y.  */
+ return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) });
+}
+
+/*
+** qi_zip1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_zip1_s (v64qi x)
+{ /* PERM5 (0, 64) is 0, 64, 1, 65, ..., 31, 95: low-half bytes taken
+     alternately from the two shuffle inputs, both of which are X.  */
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) });
+}
+
+/*
+** qi_zip1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** zip1 \3\.s, \3\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** zip1 \4\.s, \4\.s, \5\.s
+** st1b \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64qi
+qi_zip1_s_two_op (v64qi x, v64qi y)
+{ /* PERM5 (0, 64) is 0, 64, 1, 65, ..., 31, 95: interleaves the low
+     halves of X and Y.  */
+ return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) });
+}
+
+/*
+** qi_zip1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** ret
+*/
+v32qi
+qi_zip1_d (v32qi x)
+{ /* PERM4 (0, 32) is 0, 32, 1, 33, ..., 15, 47: low-half bytes taken
+     alternately from the two shuffle inputs, both of which are X.  */
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) });
+}
+
+/*
+** qi_zip1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** zip1 \3\.d, \3\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** zip1 \4\.d, \4\.d, \5\.d
+** st1b \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32qi
+qi_zip1_d_two_op (v32qi x, v32qi y)
+{ /* PERM4 (0, 32) is 0, 32, 1, 33, ..., 15, 47: interleaves the low
+     halves of X and Y.  */
+ return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) });
+}
+
+/*
+** hi_zip1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_zip1_s (v64hi x)
+{ /* PERM5 (0, 64) is 0, 64, 1, 65, ..., 31, 95: low-half halfwords
+     taken alternately from the two shuffle inputs, both of which are X.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hi_zip1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip1 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** zip1 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hi
+hi_zip1_s_two_op (v64hi x, v64hi y)
+{ /* PERM5 (0, 64) is 0, 64, 1, 65, ..., 31, 95: interleaves the low
+     halves of X and Y.  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hf_zip1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_zip1_s (v64hf x)
+{ /* PERM5 (0, 64) is 0, 64, 1, 65, ..., 31, 95: low-half elements taken
+     alternately from the two shuffle inputs, both of which are X.  The
+     selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hf_zip1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip1 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** zip1 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hf
+hf_zip1_s_two_op (v64hf x, v64hf y)
+{ /* PERM5 (0, 64) is 0, 64, 1, 65, ..., 31, 95: interleaves the low
+     halves of X and Y.  The selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** bf_zip1_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_zip1_s (v64bf x)
+{ /* PERM5 (0, 64) is 0, 64, 1, 65, ..., 31, 95: low-half elements taken
+     alternately from the two shuffle inputs, both of which are X.  The
+     selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** bf_zip1_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip1 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** zip1 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64bf
+bf_zip1_s_two_op (v64bf x, v64bf y)
+{ /* PERM5 (0, 64) is 0, 64, 1, 65, ..., 31, 95: interleaves the low
+     halves of X and Y.  The selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) });
+}
+
+/*
+** hi_zip1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hi
+hi_zip1_d (v32hi x)
+{ /* PERM4 (0, 32) is 0, 32, 1, 33, ..., 15, 47: low-half halfwords
+     taken alternately from the two shuffle inputs, both of which are X.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hi_zip1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip1 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** zip1 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hi
+hi_zip1_d_two_op (v32hi x, v32hi y)
+{ /* PERM4 (0, 32) is 0, 32, 1, 33, ..., 15, 47: interleaves the low
+     halves of X and Y.  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hf_zip1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hf
+hf_zip1_d (v32hf x)
+{ /* PERM4 (0, 32) is 0, 32, 1, 33, ..., 15, 47: low-half elements taken
+     alternately from the two shuffle inputs, both of which are X.  The
+     selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** hf_zip1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip1 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** zip1 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hf
+hf_zip1_d_two_op (v32hf x, v32hf y)
+{ /* PERM4 (0, 32) is 0, 32, 1, 33, ..., 15, 47: interleaves the low
+     halves of X and Y.  The selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** bf_zip1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32bf
+bf_zip1_d (v32bf x)
+{ /* PERM4 (0, 32) is 0, 32, 1, 33, ..., 15, 47: low-half elements taken
+     alternately from the two shuffle inputs, both of which are X.  The
+     selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** bf_zip1_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip1 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** zip1 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32bf
+bf_zip1_d_two_op (v32bf x, v32bf y)
+{ /* PERM4 (0, 32) is 0, 32, 1, 33, ..., 15, 47: interleaves the low
+     halves of X and Y.  The selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) });
+}
+
+/*
+** si_zip1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32si
+si_zip1_d (v32si x)
+{ /* PERM4 (0, 32) is 0, 32, 1, 33, ..., 15, 47: low-half words taken
+     alternately from the two shuffle inputs, both of which are X.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
+}
+
+/*
+** sf_zip1_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** zip1 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32sf
+sf_zip1_d (v32sf x)
+{ /* PERM4 (0, 32) is 0, 32, 1, 33, ..., 15, 47: low-half elements taken
+     alternately from the two shuffle inputs, both of which are X.  The
+     selector is the integer type v32si.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) });
+}
--- /dev/null
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+typedef unsigned char v128qi __attribute__((vector_size(128))); /* Fixed-size
+   GNU vector types named after their element count and element type.
+   At 128, 64 and 32 bytes they are all narrower than the 256-byte
+   vectors selected by -msve-vector-bits=2048 in the test options.  */
+typedef unsigned char v64qi __attribute__((vector_size(64)));
+typedef unsigned char v32qi __attribute__((vector_size(32)));
+typedef unsigned short v64hi __attribute__((vector_size(128)));
+typedef unsigned short v32hi __attribute__((vector_size(64)));
+typedef _Float16 v64hf __attribute__((vector_size(128)));
+typedef _Float16 v32hf __attribute__((vector_size(64)));
+typedef __bf16 v64bf __attribute__((vector_size(128)));
+typedef __bf16 v32bf __attribute__((vector_size(64)));
+typedef unsigned int v32si __attribute__((vector_size(128)));
+typedef float v32sf __attribute__((vector_size(128)));
+
+#define PERM0(B, C) B, B + C /* PERMn (B, C) expands to 2^n pairs I, I + C for I = B, B + 1, ...  */
+#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C)
+#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C)
+#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C)
+#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C) /* 16 pairs, 32 indices.  */
+#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C) /* 32 pairs, 64 indices.  */
+#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C) /* 64 pairs, 128 indices.  */
+
+/*
+** qi_zip2_h_a:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_zip2_h_a (v128qi x)
+{ /* Variant "a" names only the first shuffle input: PERM6 (64, 0) is
+     64, 64, 65, 65, ..., 127, 127, i.e. each high-half byte twice.
+     Variants "b" and "c" reach the same ZIP2 through indices that also,
+     or only, name the second input.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 0) });
+}
+
+/*
+** qi_zip2_h_b:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_zip2_h_b (v128qi x)
+{ /* PERM6 (64, 128) is 64, 192, 65, 193, ..., 127, 255: high-half bytes
+     taken alternately from the first and second shuffle inputs, both of
+     which are X.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 128) });
+}
+
+/*
+** qi_zip2_h_c:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.h, \2\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** ret
+*/
+v128qi
+qi_zip2_h_c (v128qi x)
+{ /* PERM6 (192, 0) is 192, 192, 193, 193, ..., 255, 255: each high-half
+     byte of the second shuffle input (also X), twice.  */
+ return __builtin_shuffle (x, x, (v128qi) { PERM6 (192, 0) });
+}
+
+/*
+** qi_zip2_h_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** zip2 \3\.h, \3\.h, \2\.h
+** st1b \3\.h, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.h, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.h, \1/z, \[x1\]
+** zip2 \4\.h, \4\.h, \5\.h
+** st1b \4\.h, \1, \[x8\]
+** )
+** ret
+*/
+v128qi
+qi_zip2_h_two_op (v128qi x, v128qi y)
+{ /* PERM6 (64, 128) is 64, 192, 65, 193, ..., 127, 255: interleaves the
+     high halves of X and Y.  */
+ return __builtin_shuffle (x, y, (v128qi) { PERM6 (64, 128) });
+}
+
+/*
+** qi_zip2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** ret
+*/
+v64qi
+qi_zip2_s (v64qi x)
+{ /* PERM5 (32, 64) is 32, 96, 33, 97, ..., 63, 127: high-half bytes
+     taken alternately from the two shuffle inputs, both of which are X.  */
+ return __builtin_shuffle (x, x, (v64qi) { PERM5 (32, 64) });
+}
+
+/*
+** qi_zip2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** zip2 \3\.s, \3\.s, \2\.s
+** st1b \3\.s, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.s, \1/z, \[x1\]
+** zip2 \4\.s, \4\.s, \5\.s
+** st1b \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64qi
+qi_zip2_s_two_op (v64qi x, v64qi y)
+{ /* PERM5 (32, 64) is 32, 96, 33, 97, ..., 63, 127: interleaves the
+     high halves of X and Y.  */
+ return __builtin_shuffle (x, y, (v64qi) { PERM5 (32, 64) });
+}
+
+/*
+** qi_zip2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** ret
+*/
+v32qi
+qi_zip2_d (v32qi x)
+{ /* PERM4 (16, 32) is 16, 48, 17, 49, ..., 31, 63: high-half bytes
+     taken alternately from the two shuffle inputs, both of which are X.  */
+ return __builtin_shuffle (x, x, (v32qi) { PERM4 (16, 32) });
+}
+
+/*
+** qi_zip2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** zip2 \3\.d, \3\.d, \2\.d
+** st1b \3\.d, \1, \[x8\]
+** |
+** ld1b (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1b (z[0-9]+)\.d, \1/z, \[x1\]
+** zip2 \4\.d, \4\.d, \5\.d
+** st1b \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32qi
+qi_zip2_d_two_op (v32qi x, v32qi y)
+{ /* PERM4 (16, 32) is 16, 48, 17, 49, ..., 31, 63: interleaves the high
+     halves of X and Y.  */
+ return __builtin_shuffle (x, y, (v32qi) { PERM4 (16, 32) });
+}
+
+/*
+** hi_zip2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hi
+hi_zip2_s (v64hi x)
+{ /* PERM5 (32, 64) is 32, 96, 33, 97, ..., 63, 127: high-half halfwords
+     taken alternately from the two shuffle inputs, both of which are X.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** hi_zip2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip2 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** zip2 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hi
+hi_zip2_s_two_op (v64hi x, v64hi y)
+{ /* PERM5 (32, 64) is 32, 96, 33, 97, ..., 63, 127: interleaves the
+     high halves of X and Y.  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** hf_zip2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64hf
+hf_zip2_s (v64hf x)
+{ /* PERM5 (32, 64) is 32, 96, 33, 97, ..., 63, 127: high-half elements
+     taken alternately from the two shuffle inputs, both of which are X.
+     The selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** hf_zip2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip2 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** zip2 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64hf
+hf_zip2_s_two_op (v64hf x, v64hf y)
+{ /* PERM5 (32, 64) is 32, 96, 33, 97, ..., 63, 127: interleaves the
+     high halves of X and Y.  The selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** bf_zip2_s:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.s, \2\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** ret
+*/
+v64bf
+bf_zip2_s (v64bf x)
+{ /* PERM5 (32, 64) is 32, 96, 33, 97, ..., 63, 127: high-half elements
+     taken alternately from the two shuffle inputs, both of which are X.
+     The selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** bf_zip2_s_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** zip2 \3\.s, \3\.s, \2\.s
+** st1h \3\.s, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.s, \1/z, \[x1\]
+** zip2 \4\.s, \4\.s, \5\.s
+** st1h \4\.s, \1, \[x8\]
+** )
+** ret
+*/
+v64bf
+bf_zip2_s_two_op (v64bf x, v64bf y)
+{ /* PERM5 (32, 64) is 32, 96, 33, 97, ..., 63, 127: interleaves the
+     high halves of X and Y.  The selector is the integer type v64hi.  */
+ return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) });
+}
+
+/*
+** hi_zip2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hi
+hi_zip2_d (v32hi x)
+{ /* PERM4 (16, 32) is 16, 48, 17, 49, ..., 31, 63: high-half halfwords
+     taken alternately from the two shuffle inputs, both of which are X.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** hi_zip2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip2 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** zip2 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hi
+hi_zip2_d_two_op (v32hi x, v32hi y)
+{ /* PERM4 (16, 32) is 16, 48, 17, 49, ..., 31, 63: interleaves the high
+     halves of X and Y.  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** hf_zip2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32hf
+hf_zip2_d (v32hf x)
+{ /* PERM4 (16, 32) is 16, 48, 17, 49, ..., 31, 63: high-half elements
+     taken alternately from the two shuffle inputs, both of which are X.
+     The selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** hf_zip2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip2 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** zip2 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32hf
+hf_zip2_d_two_op (v32hf x, v32hf y)
+{ /* PERM4 (16, 32) is 16, 48, 17, 49, ..., 31, 63: interleaves the high
+     halves of X and Y.  The selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** bf_zip2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** ret
+*/
+v32bf
+bf_zip2_d (v32bf x)
+{ /* PERM4 (16, 32) is 16, 48, 17, 49, ..., 31, 63: high-half elements
+     taken alternately from the two shuffle inputs, both of which are X.
+     The selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** bf_zip2_d_two_op:
+** ptrue (p[0-7])\.b, vl256
+** (
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** zip2 \3\.d, \3\.d, \2\.d
+** st1h \3\.d, \1, \[x8\]
+** |
+** ld1h (z[0-9]+)\.d, \1/z, \[x0\]
+** ld1h (z[0-9]+)\.d, \1/z, \[x1\]
+** zip2 \4\.d, \4\.d, \5\.d
+** st1h \4\.d, \1, \[x8\]
+** )
+** ret
+*/
+v32bf
+bf_zip2_d_two_op (v32bf x, v32bf y)
+{ /* PERM4 (16, 32) is 16, 48, 17, 49, ..., 31, 63: interleaves the high
+     halves of X and Y.  The selector is the integer type v32hi.  */
+ return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) });
+}
+
+/*
+** si_zip2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32si
+si_zip2_d (v32si x)
+{ /* PERM4 (16, 32) is 16, 48, 17, 49, ..., 31, 63: high-half words
+     taken alternately from the two shuffle inputs, both of which are X.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) });
+}
+
+/*
+** sf_zip2_d:
+** ptrue (p[0-7])\.b, vl256
+** ld1w (z[0-9]+)\.d, \1/z, \[x0\]
+** zip2 (z[0-9]+)\.d, \2\.d, \2\.d
+** st1w \3\.d, \1, \[x8\]
+** ret
+*/
+v32sf
+sf_zip2_d (v32sf x)
+{ /* PERM4 (16, 32) is 16, 48, 17, 49, ..., 31, 63: high-half elements
+     taken alternately from the two shuffle inputs, both of which are X.
+     The selector is the integer type v32si.  */
+ return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) });
+}