d.vmode = vmode;
d.vec_flags = aarch64_classify_vector_mode (d.vmode);
d.target = target;
- d.op0 = op0;
- d.op1 = op1;
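+ /* Operands may be null when just testing; force any real operand into
+    a register, reusing one register when op0 and op1 are the same.  */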
+ d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
+ if (op0 == op1)
+ d.op1 = d.op0;
+ else
+ d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
d.testing_p = !target;
if (!d.testing_p)
return false;
d.target = target;
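+ /* Likewise, force non-null operands into registers, sharing the
+    register when op0 and op1 are identical.  */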
+ if (op0)
+ {
+ rtx nop0 = force_reg (vmode, op0);
+ if (op0 == op1)
+ op1 = nop0;
+ op0 = nop0;
+ }
+ if (op1)
+ op1 = force_reg (vmode, op1);
d.op0 = op0;
d.op1 = op1;
for (unsigned int i = 0; i < nelt; ++i)
perm[i] = sel[i] & (2 * nelt - 1);
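+ /* The caller no longer guarantees register operands; load src0 and
+    src1 into registers before rewriting the permutation.  */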
+ src0 = force_reg (vmode, src0);
+ src1 = force_reg (vmode, src1);
+
/* Make life a bit easier by swapping operands if necessary so that
the first element always comes from src0. */
if (perm[0] >= nelt)
{
- rtx temp = src0;
- src0 = src1;
- src1 = temp;
+ std::swap (src0, src1);
for (unsigned int i = 0; i < nelt; ++i)
if (perm[i] < nelt)
two_args = canonicalize_perm (&d);
+ /* If one of the operands is a zero vector, try to match pmovzx. */
+ if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
+ {
+ struct expand_vec_perm_d dzero = d;
+ if (d.op0 == CONST0_RTX (vmode))
+ {
+ d.op1 = dzero.op1 = force_reg (vmode, d.op1);
+ std::swap (dzero.op0, dzero.op1);
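+ /* The operands were swapped, so flip each selector's operand bit to
+    keep it pointing at the same element.  */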
+ for (i = 0; i < nelt; ++i)
+ dzero.perm[i] ^= nelt;
+ }
+ else
+ d.op0 = dzero.op0 = force_reg (vmode, d.op0);
+
+ if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
+ dzero.perm, nelt, dzero.testing_p))
+ return true;
+ }
+
+ /* Force operands into registers. */
+ rtx nop0 = force_reg (vmode, d.op0);
+ if (d.op0 == d.op1)
+ d.op1 = nop0;
+ d.op0 = nop0;
+ d.op1 = force_reg (vmode, d.op1);
+
if (ix86_expand_vec_perm_const_1 (&d))
return true;
(set_attr "prefix" "maybe_evex")
(set_attr "mode" "OI")])
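+;; Match an interleave of a vector with a zero vector, expressed as a
+;; pmovzx_parallel vec_select of their concatenation, and split it after
+;; reload into a plain zero extension (vpmovzxbw).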
+(define_insn_and_split "*avx2_zero_extendv16qiv16hi2_1"
+ [(set (match_operand:V32QI 0 "register_operand" "=v")
+ (vec_select:V32QI
+ (vec_concat:V64QI
+ (match_operand:V32QI 1 "nonimmediate_operand" "vm")
+ (match_operand:V32QI 2 "const0_operand" "C"))
+ (match_parallel 3 "pmovzx_parallel"
+ [(match_operand 4 "const_int_operand" "n")])))]
+ "TARGET_AVX2"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (zero_extend:V16HI (match_dup 1)))]
+{
+ operands[0] = lowpart_subreg (V16HImode, operands[0], V32QImode);
+ operands[1] = lowpart_subreg (V16QImode, operands[1], V32QImode);
+})
+
(define_expand "<insn>v16qiv16hi2"
[(set (match_operand:V16HI 0 "register_operand")
(any_extend:V16HI
(set_attr "prefix" "evex")
(set_attr "mode" "XI")])
+(define_insn_and_split "*avx512bw_zero_extendv32qiv32hi2_1"
+ [(set (match_operand:V64QI 0 "register_operand" "=v")
+ (vec_select:V64QI
+ (vec_concat:V128QI
+ (match_operand:V64QI 1 "nonimmediate_operand" "vm")
+ (match_operand:V64QI 2 "const0_operand" "C"))
+ (match_parallel 3 "pmovzx_parallel"
+ [(match_operand 4 "const_int_operand" "n")])))]
+ "TARGET_AVX512BW"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (zero_extend:V32HI (match_dup 1)))]
+{
+ operands[0] = lowpart_subreg (V32HImode, operands[0], V64QImode);
+ operands[1] = lowpart_subreg (V32QImode, operands[1], V64QImode);
+})
+
(define_expand "<insn>v32qiv32hi2"
[(set (match_operand:V32HI 0 "register_operand")
(any_extend:V32HI
(match_operand:V16HI 1 "nonimmediate_operand")))]
"TARGET_AVX512F")
+(define_insn_and_split "*avx512f_zero_extendv16hiv16si2_1"
+ [(set (match_operand:V32HI 0 "register_operand" "=v")
+ (vec_select:V32HI
+ (vec_concat:V64HI
+ (match_operand:V32HI 1 "nonimmediate_operand" "vm")
+ (match_operand:V32HI 2 "const0_operand" "C"))
+ (match_parallel 3 "pmovzx_parallel"
+ [(match_operand 4 "const_int_operand" "n")])))]
+ "TARGET_AVX512F"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (zero_extend:V16SI (match_dup 1)))]
+{
+ operands[0] = lowpart_subreg (V16SImode, operands[0], V32HImode);
+ operands[1] = lowpart_subreg (V16HImode, operands[1], V32HImode);
+})
+
(define_insn "avx2_<code>v8hiv8si2<mask_name>"
[(set (match_operand:V8SI 0 "register_operand" "=v")
(any_extend:V8SI
(match_operand:V8HI 1 "nonimmediate_operand")))]
"TARGET_AVX2")
+(define_insn_and_split "*avx2_zero_extendv8hiv8si2_1"
+ [(set (match_operand:V16HI 0 "register_operand" "=v")
+ (vec_select:V16HI
+ (vec_concat:V32HI
+ (match_operand:V16HI 1 "nonimmediate_operand" "vm")
+ (match_operand:V16HI 2 "const0_operand" "C"))
+ (match_parallel 3 "pmovzx_parallel"
+ [(match_operand 4 "const_int_operand" "n")])))]
+ "TARGET_AVX2"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (zero_extend:V8SI (match_dup 1)))]
+{
+ operands[0] = lowpart_subreg (V8SImode, operands[0], V16HImode);
+ operands[1] = lowpart_subreg (V8HImode, operands[1], V16HImode);
+})
+
(define_insn "sse4_1_<code>v4hiv4si2<mask_name>"
[(set (match_operand:V4SI 0 "register_operand" "=Yr,*x,v")
(any_extend:V4SI
(set_attr "prefix" "evex")
(set_attr "mode" "XI")])
+(define_insn_and_split "*avx512f_zero_extendv8siv8di2_1"
+ [(set (match_operand:V16SI 0 "register_operand" "=v")
+ (vec_select:V16SI
+ (vec_concat:V32SI
+ (match_operand:V16SI 1 "nonimmediate_operand" "vm")
+ (match_operand:V16SI 2 "const0_operand" "C"))
+ (match_parallel 3 "pmovzx_parallel"
+ [(match_operand 4 "const_int_operand" "n")])))]
+ "TARGET_AVX512F"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (zero_extend:V8DI (match_dup 1)))]
+{
+ operands[0] = lowpart_subreg (V8DImode, operands[0], V16SImode);
+ operands[1] = lowpart_subreg (V8SImode, operands[1], V16SImode);
+})
+
(define_expand "<insn>v8siv8di2"
[(set (match_operand:V8DI 0 "register_operand" "=v")
(any_extend:V8DI
(set_attr "prefix_extra" "1")
(set_attr "mode" "OI")])
+(define_insn_and_split "*avx2_zero_extendv4siv4di2_1"
+ [(set (match_operand:V8SI 0 "register_operand" "=v")
+ (vec_select:V8SI
+ (vec_concat:V16SI
+ (match_operand:V8SI 1 "nonimmediate_operand" "vm")
+ (match_operand:V8SI 2 "const0_operand" "C"))
+ (match_parallel 3 "pmovzx_parallel"
+ [(match_operand 4 "const_int_operand" "n")])))]
+ "TARGET_AVX2"
+ "#"
+ "&& reload_completed"
+ [(set (match_dup 0) (zero_extend:V4DI (match_dup 1)))]
+{
+ operands[0] = lowpart_subreg (V4DImode, operands[0], V8SImode);
+ operands[1] = lowpart_subreg (V4SImode, operands[1], V8SImode);
+})
+
(define_expand "<insn>v4siv4di2"
[(set (match_operand:V4DI 0 "register_operand" "=v")
(any_extend:V4DI
unsigned int i, nelt, which;
d.target = target;
+ if (op0)
+ {
+ rtx nop0 = force_reg (vmode, op0);
+ if (op0 == op1)
+ op1 = nop0;
+ op0 = nop0;
+ }
+ if (op1)
+ op1 = force_reg (vmode, op1);
d.op0 = op0;
d.op1 = op1;
bool ok;
d.target = target;
+ if (op0)
+ {
+ rtx nop0 = force_reg (vmode, op0);
+ if (op0 == op1)
+ op1 = nop0;
+ op0 = nop0;
+ }
+ if (op1)
+ op1 = force_reg (vmode, op1);
d.op0 = op0;
d.op1 = op1;
if (TARGET_ALTIVEC && testing_p)
return true;
+ if (op0)
+ {
+ rtx nop0 = force_reg (vmode, op0);
+ if (op0 == op1)
+ op1 = nop0;
+ op0 = nop0;
+ }
+ if (op1)
+ op1 = force_reg (vmode, op1);
+
/* Check for ps_merge* or xxpermdi insns. */
if ((vmode == V2DFmode || vmode == V2DImode) && VECTOR_MEM_VSX_P (vmode))
{
if (vmode != V8QImode)
return false;
+ rtx nop0 = force_reg (vmode, op0);
+ if (op0 == op1)
+ op1 = nop0;
+ op0 = nop0;
+ op1 = force_reg (vmode, op1);
+
unsigned int i, mask;
for (i = mask = 0; i < 8; ++i)
mask |= (sel[i] & 0xf) << (28 - i*4);
if (targetm.vectorize.vec_perm_const != NULL)
{
- v0 = force_reg (mode, v0);
if (single_arg_p)
v1 = v0;
- else
- v1 = force_reg (mode, v1);
if (targetm.vectorize.vec_perm_const (mode, target, v0, v1, indices))
return target;
return gen_lowpart (mode, target_qi);
}
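+ /* Only the variable permutation fallback below needs register operands;
+    the target hook above is intentionally given the original operands so
+    it can exploit constants such as a zero vector.  */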
+ v0 = force_reg (mode, v0);
+ if (single_arg_p)
+ v1 = v0;
+ v1 = force_reg (mode, v1);
+
   /* Otherwise expand as a fully variable permutation. */
/* The optabs are only defined for selectors with the same width
/* PR target/95905 */
/* { dg-do compile } */
/* { dg-options "-O2 -msse4.1" } */
-/* { dg-final { scan-assembler "\tv?pmovzxbw\t" } } */
-/* { dg-final { scan-assembler "\tv?pmovzxwd\t" } } */
-/* { dg-final { scan-assembler "\tv?pmovzxdq\t" } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tv?pmovzxdq\t" 4 } } */
typedef unsigned char V1 __attribute__((vector_size (16)));
typedef unsigned short V2 __attribute__((vector_size (16)));
{
return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 4, 1, 5 });
}
+
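+/* f7 through f12 pass the zero vector as the first __builtin_shuffle
+   operand; the same pmovzx* instructions should still be generated.  */
+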
+V1
+f7 (V1 x)
+{
+ return __builtin_shuffle ((V1) {}, x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V2
+f8 (V2 x)
+{
+ return __builtin_shuffle ((V2) {}, x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V3
+f9 (V3 x)
+{
+ return __builtin_shuffle ((V3) {}, x, (V3) { 4, 0, 5, 1 });
+}
+
+V1
+f10 (V1 *x)
+{
+ return __builtin_shuffle ((V1) {}, *x, (V1) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V2
+f11 (V2 *x)
+{
+ return __builtin_shuffle ((V2) {}, *x, (V2) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V3
+f12 (V3 *x)
+{
+ return __builtin_shuffle ((V3) {}, *x, (V3) { 4, 0, 5, 1 });
+}
--- /dev/null
+/* PR target/95905 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
+
+typedef unsigned char V1 __attribute__((vector_size (32)));
+typedef unsigned short V2 __attribute__((vector_size (32)));
+typedef unsigned int V3 __attribute__((vector_size (32)));
+
+V1
+f1 (V1 x)
+{
+ return __builtin_shuffle (x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V2
+f2 (V2 x)
+{
+ return __builtin_shuffle (x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V3
+f3 (V3 x)
+{
+ return __builtin_shuffle (x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+V1
+f4 (V1 *x)
+{
+ return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V2
+f5 (V2 *x)
+{
+ return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V3
+f6 (V3 *x)
+{
+ return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 8, 1, 9, 2, 10, 3, 11 });
+}
+
+V1
+f7 (V1 x)
+{
+ return __builtin_shuffle ((V1) {}, x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V2
+f8 (V2 x)
+{
+ return __builtin_shuffle ((V2) {}, x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V3
+f9 (V3 x)
+{
+ return __builtin_shuffle ((V3) {}, x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
+
+V1
+f10 (V1 *x)
+{
+ return __builtin_shuffle ((V1) {}, *x, (V1) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V2
+f11 (V2 *x)
+{
+ return __builtin_shuffle ((V2) {}, *x, (V2) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V3
+f12 (V3 *x)
+{
+ return __builtin_shuffle ((V3) {}, *x, (V3) { 8, 0, 9, 1, 10, 2, 11, 3 });
+}
--- /dev/null
+/* PR target/95905 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-final { scan-assembler-times "\tvpmovzxbw\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxwd\t" 4 } } */
+/* { dg-final { scan-assembler-times "\tvpmovzxdq\t" 4 } } */
+
+typedef unsigned char V1 __attribute__((vector_size (64)));
+typedef unsigned short V2 __attribute__((vector_size (64)));
+typedef unsigned int V3 __attribute__((vector_size (64)));
+
+V1
+f1 (V1 x)
+{
+ return __builtin_shuffle (x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
+}
+
+V2
+f2 (V2 x)
+{
+ return __builtin_shuffle (x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V3
+f3 (V3 x)
+{
+ return __builtin_shuffle (x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V1
+f4 (V1 *x)
+{
+ return __builtin_shuffle (*x, (V1) {}, (V1) { 0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95 });
+}
+
+V2
+f5 (V2 *x)
+{
+ return __builtin_shuffle (*x, (V2) {}, (V2) { 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47 });
+}
+
+V3
+f6 (V3 *x)
+{
+ return __builtin_shuffle (*x, (V3) {}, (V3) { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 });
+}
+
+V1
+f7 (V1 x)
+{
+ return __builtin_shuffle ((V1) {}, x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
+}
+
+V2
+f8 (V2 x)
+{
+ return __builtin_shuffle ((V2) {}, x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V3
+f9 (V3 x)
+{
+ return __builtin_shuffle ((V3) {}, x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}
+
+V1
+f10 (V1 *x)
+{
+ return __builtin_shuffle ((V1) {}, *x, (V1) { 64, 0, 65, 1, 66, 2, 67, 3, 68, 4, 69, 5, 70, 6, 71, 7, 72, 8, 73, 9, 74, 10, 75, 11, 76, 12, 77, 13, 78, 14, 79, 15, 80, 16, 81, 17, 82, 18, 83, 19, 84, 20, 85, 21, 86, 22, 87, 23, 88, 24, 89, 25, 90, 26, 91, 27, 92, 28, 93, 29, 94, 30, 95, 31 });
+}
+
+V2
+f11 (V2 *x)
+{
+ return __builtin_shuffle ((V2) {}, *x, (V2) { 32, 0, 33, 1, 34, 2, 35, 3, 36, 4, 37, 5, 38, 6, 39, 7, 40, 8, 41, 9, 42, 10, 43, 11, 44, 12, 45, 13, 46, 14, 47, 15 });
+}
+
+V3
+f12 (V3 *x)
+{
+ return __builtin_shuffle ((V3) {}, *x, (V3) { 16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7 });
+}