return ok;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
using movss or movsd. */
static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */
static bool
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of the variable form of vpermilps.
Note that we will have already failed the immediate input vpermilps,
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */
static bool
static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
in a single instruction. */
static bool
return false;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
in terms of a pair of pshuflw + pshufhw instructions. */
static bool
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
the permutation using the SSSE3 palignr instruction. This succeeds
when all of the elements in PERM fit within one vector and we merely
need to shift them down so that a single vector permutation has a
static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a two vector permutation into a single vector permutation by using
an interleave operation to merge the vectors. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a single vector cross-lane permutation into vpermq followed
by any of the single insn permutations. */
static bool canonicalize_perm (struct expand_vec_perm_d *d);
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
a vector permutation using two instructions, vperm2f128 resp.
vperm2i128 followed by any single in-lane permutation. */
return false;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
a two vector permutation using 2 intra-lane interleave insns
and cross-lane shuffle for 32-byte vectors. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
a single vector permutation using a single intra-lane vector
permutation, vperm2f128 swapping the lanes and vblend* insn blending
the non-swapped and swapped vectors together. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together. */
return true;
}
+static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
+
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
+ a two vector permutation using two intra-lane vector
+ permutations, vperm2f128 swapping the lanes and vblend* insn blending
+ the non-swapped and swapped vectors together.
+
+ D is split into two sub-permutations. DFIRST receives the elements
+ whose source index already has the same NELT/2 bit as their destination
+ slot; DSECOND receives the remaining elements, stored at the slot with
+ that bit flipped. Both are expanded recursively through
+ ix86_expand_vec_perm_const_1. The DSECOND result then has its two
+ halves exchanged (DTHIRD) and is blended with the DFIRST result into
+ D->target, using the mask of slots that were routed through DSECOND.
+
+ Return false if the target/mode/operand checks fail, if every slot or
+ no slot would be routed through DSECOND, or if either recursive
+ expansion fails. When D->testing_p is set, return true once both
+ recursive expansions succeed, without emitting the captured sequences
+ or the blend. */
+
+static bool
+expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
+{
+ struct expand_vec_perm_d dfirst, dsecond, dthird;
+ unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
+ rtx_insn *seq1, *seq2;
+ bool ok;
+ rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
+
+ if (!TARGET_AVX /* Only AVX without AVX2, V8SF/V4DF, two operands. */
+ || TARGET_AVX2
+ || (d->vmode != V8SFmode && d->vmode != V4DFmode)
+ || d->one_operand_p)
+ return false;
+
+ dfirst = *d; /* Both sub-permutations start out as copies of D. */
+ dsecond = *d;
+ for (i = 0; i < nelt; i++)
+ {
+ dfirst.perm[i] = 0xff; /* 0xff marks a slot that is not assigned yet. */
+ dsecond.perm[i] = 0xff;
+ }
+ for (i = 0, msk = 0; i < nelt; i++)
+ {
+ j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2; /* J is I with its
+    NELT2 bit replaced by the NELT2 bit of the source index. */
+ if (j == i)
+ {
+ dfirst.perm[j] = d->perm[i];
+ which1 |= (d->perm[i] < nelt ? 1 : 2); /* Bit value 1: an index < NELT
+    was seen; bit value 2: an index >= NELT was seen. */
+ }
+ else
+ {
+ dsecond.perm[j] = d->perm[i];
+ which2 |= (d->perm[i] < nelt ? 1 : 2);
+ msk |= (1U << i); /* Slot I is supplied through DSECOND. */
+ }
+ }
+ if (msk == 0 || msk == (1U << nelt) - 1) /* Every slot went to the same
+    sub-permutation, so this two-way split does not apply. */
+ return false;
+
+ if (!d->testing_p) /* Result registers are allocated only when not testing. */
+ {
+ dfirst.target = gen_reg_rtx (dfirst.vmode);
+ dsecond.target = gen_reg_rtx (dsecond.vmode);
+ }
+
+ for (i = 0; i < nelt; i++) /* Give the still-unassigned slots a filler
+    index: I + NELT when only indices >= NELT were assigned to that
+    sub-permutation, otherwise I. */
+ {
+ if (dfirst.perm[i] == 0xff)
+ dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
+ if (dsecond.perm[i] == 0xff)
+ dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
+ }
+ canonicalize_perm (&dfirst);
+ start_sequence (); /* Capture the insns in SEQ1 rather than emitting them
+    now; they are emitted below only if everything succeeds. */
+ ok = ix86_expand_vec_perm_const_1 (&dfirst);
+ seq1 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+
+ canonicalize_perm (&dsecond);
+ start_sequence (); /* Same capture scheme for the second sub-permutation. */
+ ok = ix86_expand_vec_perm_const_1 (&dsecond);
+ seq2 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+
+ if (d->testing_p) /* Both sub-permutations expand; SEQ1/SEQ2 are dropped. */
+ return true;
+
+ emit_insn (seq1);
+ emit_insn (seq2);
+
+ dthird = *d; /* DTHIRD is a one-operand permutation of DSECOND's result. */
+ dthird.op0 = dsecond.target;
+ dthird.op1 = dsecond.target;
+ dthird.one_operand_p = true;
+ dthird.target = gen_reg_rtx (dthird.vmode);
+ for (i = 0; i < nelt; i++)
+ dthird.perm[i] = i ^ nelt2; /* Exchange the two halves of the vector. */
+
+ ok = expand_vec_perm_1 (&dthird);
+ gcc_assert (ok);
+
+ blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
+ emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk))); /* Combine
+    DFIRST's and DTHIRD's results into D->target under immediate MSK. */
+ return true;
+}
+
/* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
permutation with two pshufb insns and an ior. We should have already
failed all two instruction sequences. */
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
and extract-odd permutations. */
static bool
return true;
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
extract-even and extract-odd permutations. */
static bool
return expand_vec_perm_even_odd_1 (d, odd);
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast
+/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
permutations. We assume that expand_vec_perm_1 has already failed. */
static bool
}
}
-/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match
+/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
broadcast permutations. */
static bool
return true;
}
+ /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
+ if (expand_vec_perm2_vperm2f128_vblend (d))
+ return true;
+
return false;
}
int i, which, nelt = d->nelt;
for (i = which = 0; i < nelt; ++i)
- which |= (d->perm[i] < nelt ? 1 : 2);
+ which |= (d->perm[i] < nelt ? 1 : 2);
d->one_operand_p = true;
switch (which)
T (22, 1, 2, 3, 4, 5, 6, 7, 0) \
T (23, 6, 5, 4, 3, 2, 1, 0, 7) \
T (24, 0, 1, 2, 3, 8, 9, 10, 11) \
-T (25, 0, 1, 2, 3, 12, 13, 14, 15)
+T (25, 0, 1, 2, 3, 12, 13, 14, 15) \
+T (26, 0, 1, 8, 9, 10, 11, 12, 13) \
+T (27, 0, 8, 9, 10, 11, 12, 13, 14)
#define EXPTESTS \
T (116, 9, 3, 9, 4, 7, 0, 0, 6) \
T (117, 4, 14, 12, 8, 9, 6, 0, 10) \