+2018-01-02 Richard Sandiford <richard.sandiford@linaro.org>
+
+ * optabs.c (expand_vec_perm_var): Use an explicit encoding for
+ the broadcast of the low byte.
+ (expand_mult_highpart): Use an explicit encoding for the permutes.
+ * optabs-query.c (can_mult_highpart_p): Likewise.
+ * tree-vect-loop.c (calc_vec_perm_mask_for_shift): Likewise.
+ * tree-vect-stmts.c (perm_mask_for_reverse): Likewise.
+ (vectorizable_bswap): Likewise.
+ * tree-vect-data-refs.c (vect_grouped_store_supported): Use an
+ explicit encoding for the power-of-2 permutes.
+ (vect_permute_store_chain): Likewise.
+ (vect_grouped_load_supported): Likewise.
+ (vect_permute_load_chain): Likewise.
+
2018-01-02 Richard Sandiford <richard.sandiford@linaro.org>
* vec-perm-indices.h (vec_perm_indices_to_tree): Declare.
op = uns_p ? vec_widen_umult_odd_optab : vec_widen_smult_odd_optab;
if (optab_handler (op, mode) != CODE_FOR_nothing)
{
- vec_perm_builder sel (nunits, nunits, 1);
- for (i = 0; i < nunits; ++i)
+ /* The encoding has 2 interleaved stepped patterns. */
+ vec_perm_builder sel (nunits, 2, 3);
+ for (i = 0; i < 6; ++i)
sel.quick_push (!BYTES_BIG_ENDIAN
+ (i & ~1)
+ ((i & 1) ? nunits : 0));
op = uns_p ? vec_widen_umult_lo_optab : vec_widen_smult_lo_optab;
if (optab_handler (op, mode) != CODE_FOR_nothing)
{
- vec_perm_builder sel (nunits, nunits, 1);
- for (i = 0; i < nunits; ++i)
+ /* The encoding has a single stepped pattern. */
+ vec_perm_builder sel (nunits, 1, 3);
+ for (int i = 0; i < 3; ++i)
sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1));
vec_perm_indices indices (sel, 2, nunits);
if (can_vec_perm_const_p (mode, indices))
NULL, 0, OPTAB_DIRECT);
gcc_assert (sel != NULL);
- /* Broadcast the low byte each element into each of its bytes. */
- vec_perm_builder const_sel (w, w, 1);
- for (i = 0; i < w; ++i)
- {
- int this_e = i / u * u;
- if (BYTES_BIG_ENDIAN)
- this_e += u - 1;
- const_sel.quick_push (this_e);
- }
+ /* Broadcast the low byte of each element into each of its bytes.
+ The encoding has U interleaved stepped patterns, one for each
+ byte of an element. */
+ vec_perm_builder const_sel (w, u, 3);
+ unsigned int low_byte_in_u = BYTES_BIG_ENDIAN ? u - 1 : 0;
+ for (i = 0; i < 3; ++i)
+ for (unsigned int j = 0; j < u; ++j)
+ const_sel.quick_push (i * u + low_byte_in_u);
sel = gen_lowpart (qimode, sel);
sel = expand_vec_perm_const (qimode, sel, sel, const_sel, qimode, NULL);
gcc_assert (sel != NULL);
expand_insn (optab_handler (tab2, mode), 3, eops);
m2 = gen_lowpart (mode, eops[0].value);
- vec_perm_builder sel (nunits, nunits, 1);
+ vec_perm_builder sel;
if (method == 2)
{
- for (i = 0; i < nunits; ++i)
+ /* The encoding has 2 interleaved stepped patterns. */
+ sel.new_vector (nunits, 2, 3);
+ for (i = 0; i < 6; ++i)
sel.quick_push (!BYTES_BIG_ENDIAN + (i & ~1)
+ ((i & 1) ? nunits : 0));
}
else
{
- for (i = 0; i < nunits; ++i)
+ /* The encoding has a single stepped pattern. */
+ sel.new_vector (nunits, 1, 3);
+ for (i = 0; i < 3; ++i)
sel.quick_push (2 * i + (BYTES_BIG_ENDIAN ? 0 : 1));
}
if (VECTOR_MODE_P (mode))
{
unsigned int i, nelt = GET_MODE_NUNITS (mode);
- vec_perm_builder sel (nelt, nelt, 1);
- sel.quick_grow (nelt);
-
if (count == 3)
{
unsigned int j0 = 0, j1 = 0, j2 = 0;
unsigned int i, j;
+ vec_perm_builder sel (nelt, nelt, 1);
+ sel.quick_grow (nelt);
vec_perm_indices indices;
for (j = 0; j < 3; j++)
{
/* If length is not equal to 3 then only power of 2 is supported. */
gcc_assert (pow2p_hwi (count));
- for (i = 0; i < nelt / 2; i++)
+ /* The encoding has 2 interleaved stepped patterns. */
+ vec_perm_builder sel (nelt, 2, 3);
+ sel.quick_grow (6);
+ for (i = 0; i < 3; i++)
{
sel[i * 2] = i;
sel[i * 2 + 1] = i + nelt;
vec_perm_indices indices (sel, 2, nelt);
if (can_vec_perm_const_p (mode, indices))
{
- for (i = 0; i < nelt; i++)
+ for (i = 0; i < 6; i++)
sel[i] += nelt / 2;
indices.new_vector (sel, 2, nelt);
if (can_vec_perm_const_p (mode, indices))
unsigned int i, n, log_length = exact_log2 (length);
unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
- vec_perm_builder sel (nelt, nelt, 1);
- sel.quick_grow (nelt);
-
result_chain->quick_grow (length);
memcpy (result_chain->address (), dr_chain.address (),
length * sizeof (tree));
{
unsigned int j0 = 0, j1 = 0, j2 = 0;
+ vec_perm_builder sel (nelt, nelt, 1);
+ sel.quick_grow (nelt);
vec_perm_indices indices;
for (j = 0; j < 3; j++)
{
/* If length is not equal to 3 then only power of 2 is supported. */
gcc_assert (pow2p_hwi (length));
- for (i = 0, n = nelt / 2; i < n; i++)
+ /* The encoding has 2 interleaved stepped patterns. */
+ vec_perm_builder sel (nelt, 2, 3);
+ sel.quick_grow (6);
+ for (i = 0; i < 3; i++)
{
sel[i * 2] = i;
sel[i * 2 + 1] = i + nelt;
vec_perm_indices indices (sel, 2, nelt);
perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
- for (i = 0; i < nelt; i++)
+ for (i = 0; i < 6; i++)
sel[i] += nelt / 2;
indices.new_vector (sel, 2, nelt);
perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
if (VECTOR_MODE_P (mode))
{
unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
- vec_perm_builder sel (nelt, nelt, 1);
- sel.quick_grow (nelt);
if (count == 3)
{
+ vec_perm_builder sel (nelt, nelt, 1);
+ sel.quick_grow (nelt);
vec_perm_indices indices;
unsigned int k;
for (k = 0; k < 3; k++)
/* If length is not equal to 3 then only power of 2 is supported. */
gcc_assert (pow2p_hwi (count));
- for (i = 0; i < nelt; i++)
+ /* The encoding has a single stepped pattern. */
+ vec_perm_builder sel (nelt, 1, 3);
+ sel.quick_grow (3);
+ for (i = 0; i < 3; i++)
sel[i] = i * 2;
vec_perm_indices indices (sel, 2, nelt);
if (can_vec_perm_const_p (mode, indices))
{
- for (i = 0; i < nelt; i++)
+ for (i = 0; i < 3; i++)
sel[i] = i * 2 + 1;
indices.new_vector (sel, 2, nelt);
if (can_vec_perm_const_p (mode, indices))
unsigned int i, j, log_length = exact_log2 (length);
unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
- vec_perm_builder sel (nelt, nelt, 1);
- sel.quick_grow (nelt);
-
result_chain->quick_grow (length);
memcpy (result_chain->address (), dr_chain.address (),
length * sizeof (tree));
{
unsigned int k;
+ vec_perm_builder sel (nelt, nelt, 1);
+ sel.quick_grow (nelt);
vec_perm_indices indices;
for (k = 0; k < 3; k++)
{
/* If length is not equal to 3 then only power of 2 is supported. */
gcc_assert (pow2p_hwi (length));
- for (i = 0; i < nelt; ++i)
+ /* The encoding has a single stepped pattern. */
+ vec_perm_builder sel (nelt, 1, 3);
+ sel.quick_grow (3);
+ for (i = 0; i < 3; ++i)
sel[i] = i * 2;
vec_perm_indices indices (sel, 2, nelt);
perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
- for (i = 0; i < nelt; ++i)
+ for (i = 0; i < 3; ++i)
sel[i] = i * 2 + 1;
indices.new_vector (sel, 2, nelt);
perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
vec_perm_builder *sel)
{
- sel->new_vector (nelt, nelt, 1);
- for (unsigned int i = 0; i < nelt; i++)
+ /* The encoding is a single stepped pattern. Any wrap-around is handled
+ by vec_perm_indices. */
+ sel->new_vector (nelt, 1, 3);
+ for (unsigned int i = 0; i < 3; i++)
sel->quick_push (i + offset);
}
nunits = TYPE_VECTOR_SUBPARTS (vectype);
- vec_perm_builder sel (nunits, nunits, 1);
- for (i = 0; i < nunits; ++i)
+ /* The encoding has a single stepped pattern. */
+ vec_perm_builder sel (nunits, 1, 3);
+ for (i = 0; i < 3; ++i)
sel.quick_push (nunits - 1 - i);
vec_perm_indices indices (sel, 1, nunits);
unsigned int num_bytes = TYPE_VECTOR_SUBPARTS (char_vectype);
unsigned word_bytes = num_bytes / nunits;
- vec_perm_builder elts (num_bytes, num_bytes, 1);
- for (unsigned i = 0; i < nunits; ++i)
+ /* The encoding uses one stepped pattern for each byte in the word. */
+ vec_perm_builder elts (num_bytes, word_bytes, 3);
+ for (unsigned i = 0; i < 3; ++i)
for (unsigned j = 0; j < word_bytes; ++j)
elts.quick_push ((i + 1) * word_bytes - j - 1);