return true;
}
+/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
+ and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
+ with two "and" and "pack" or two "shift" and "pack" insns. We should
+ have already failed all two instruction sequences. */
+
+static bool
+expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
+{
+ rtx op, dop0, dop1, t, rperm[16];
+ unsigned i, odd, c, s, nelt = d->nelt;
+ bool end_perm = false;
+ machine_mode half_mode;
+ rtx (*gen_and) (rtx, rtx, rtx);
+ rtx (*gen_pack) (rtx, rtx, rtx);
+ rtx (*gen_shift) (rtx, rtx, rtx);
+
+ if (d->one_operand_p)
+ return false;
+
+ switch (d->vmode)
+ {
+ case V8HImode:
+ /* Required for "pack". */
+ if (!TARGET_SSE4_1)
+ return false;
+ c = 0xffff;
+ s = 16;
+ half_mode = V4SImode;
+ gen_and = gen_andv4si3;
+ gen_pack = gen_sse4_1_packusdw;
+ gen_shift = gen_lshrv4si3;
+ break;
+ case V16QImode:
+ /* No check as all instructions are SSE2. */
+ c = 0xff;
+ s = 8;
+ half_mode = V8HImode;
+ gen_and = gen_andv8hi3;
+ gen_pack = gen_sse2_packuswb;
+ gen_shift = gen_lshrv8hi3;
+ break;
+ case V16HImode:
+ if (!TARGET_AVX2)
+ return false;
+ c = 0xffff;
+ s = 16;
+ half_mode = V8SImode;
+ gen_and = gen_andv8si3;
+ gen_pack = gen_avx2_packusdw;
+ gen_shift = gen_lshrv8si3;
+ end_perm = true;
+ break;
+ case V32QImode:
+ if (!TARGET_AVX2)
+ return false;
+ c = 0xff;
+ s = 8;
+ half_mode = V16HImode;
+ gen_and = gen_andv16hi3;
+ gen_pack = gen_avx2_packuswb;
+ gen_shift = gen_lshrv16hi3;
+ end_perm = true;
+ break;
+ default:
+ /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
+ general shuffles. */
+ return false;
+ }
+
+ /* Check that permutation is even or odd. */
+ odd = d->perm[0];
+ if (odd > 1)
+ return false;
+
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != 2 * i + odd)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ dop0 = gen_reg_rtx (half_mode);
+ dop1 = gen_reg_rtx (half_mode);
+ if (odd == 0)
+ {
+ for (i = 0; i < nelt / 2; i++)
+ rperm[i] = GEN_INT (c);
+ t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
+ t = force_reg (half_mode, t);
+ emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+ emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+ }
+ else
+ {
+ emit_insn (gen_shift (dop0,
+ gen_lowpart (half_mode, d->op0),
+ GEN_INT (s)));
+ emit_insn (gen_shift (dop1,
+ gen_lowpart (half_mode, d->op1),
+ GEN_INT (s)));
+ }
+ /* In AVX2 for 256 bit case we need to permute pack result. */
+ if (TARGET_AVX2 && end_perm)
+ {
+ op = gen_reg_rtx (d->vmode);
+ t = gen_reg_rtx (V4DImode);
+ emit_insn (gen_pack (op, dop0, dop1));
+ emit_insn (gen_avx2_permv4di_1 (t,
+ gen_lowpart (V4DImode, op),
+ const0_rtx,
+ const2_rtx,
+ const1_rtx,
+ GEN_INT (3)));
+ emit_move_insn (d->target, gen_lowpart (d->vmode, t));
+ }
+ else
+ emit_insn (gen_pack (d->target, dop0, dop1));
+
+ return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
and extract-odd permutations. */
gcc_unreachable ();
case V8HImode:
- if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+ if (TARGET_SSE4_1)
+ return expand_vec_perm_even_odd_pack (d);
+ else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
return expand_vec_perm_pshufb2 (d);
else
{
break;
case V16QImode:
- if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
- return expand_vec_perm_pshufb2 (d);
- else
- {
- if (d->testing_p)
- break;
- t1 = gen_reg_rtx (V16QImode);
- t2 = gen_reg_rtx (V16QImode);
- t3 = gen_reg_rtx (V16QImode);
- emit_insn (gen_vec_interleave_highv16qi (t1, d->op0, d->op1));
- emit_insn (gen_vec_interleave_lowv16qi (d->target, d->op0, d->op1));
- emit_insn (gen_vec_interleave_highv16qi (t2, d->target, t1));
- emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t1));
- emit_insn (gen_vec_interleave_highv16qi (t3, d->target, t2));
- emit_insn (gen_vec_interleave_lowv16qi (d->target, d->target, t2));
- if (odd)
- t3 = gen_vec_interleave_highv16qi (d->target, d->target, t3);
- else
- t3 = gen_vec_interleave_lowv16qi (d->target, d->target, t3);
- emit_insn (t3);
- }
- break;
+ return expand_vec_perm_even_odd_pack (d);
case V16HImode:
case V32QImode:
- return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
+ return expand_vec_perm_even_odd_pack (d);
case V4DImode:
if (!TARGET_AVX2)
/* Try sequences of three instructions. */
+ if (expand_vec_perm_even_odd_pack (d))
+ return true;
+
if (expand_vec_perm_2vperm2f128_vshuf (d))
return true;