The SLP code currently punts for all variable-length permutes.
This patch makes it handle the easy case of N->N permutes in which
the number of vector lanes is a multiple of N. Every permute then
uses the same mask, and that mask repeats (with a stride) every
N elements.
The patch uses the same path for constant-length vectors,
since it should be slightly cheaper in terms of compile time.
2018-08-24 Richard Sandiford <richard.sandiford@arm.com>
gcc/
* tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
the case in which the permute needs only a single element and
repeats for every vector of the result. Extend that case to
handle variable-length vectors.
* tree-vect-stmts.c (vectorizable_load): Update accordingly.
gcc/testsuite/
* gcc.target/aarch64/sve/slp_perm_1.c: New test.
* gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
* gcc.target/aarch64/sve/slp_perm_7.c: Likewise.
From-SVN: r263832
+2018-08-24 Richard Sandiford <richard.sandiford@arm.com>
+
+ * tree-vect-slp.c (vect_transform_slp_perm_load): Separate out
+ the case in which the permute needs only a single element and
+ repeats for every vector of the result. Extend that case to
+ handle variable-length vectors.
+ * tree-vect-stmts.c (vectorizable_load): Update accordingly.
+
2018-08-24 H.J. Lu <hongjiu.lu@intel.com>
PR debug/79342
+2018-08-24 Richard Sandiford <richard.sandiford@arm.com>
+
+ * gcc.target/aarch64/sve/slp_perm_1.c: New test.
+ * gcc.target/aarch64/sve/slp_perm_2.c: Likewise.
+ * gcc.target/aarch64/sve/slp_perm_3.c: Likewise.
+ * gcc.target/aarch64/sve/slp_perm_4.c: Likewise.
+ * gcc.target/aarch64/sve/slp_perm_5.c: Likewise.
+ * gcc.target/aarch64/sve/slp_perm_6.c: Likewise.
+ * gcc.target/aarch64/sve/slp_perm_7.c: Likewise.
+
2018-08-24 H.J. Lu <hongjiu.lu@intel.com>
PR debug/79342
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b)
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ a[i * 8] = b[i * 8 + 7] + 1;
+ a[i * 8 + 1] = b[i * 8 + 6] + 2;
+ a[i * 8 + 2] = b[i * 8 + 5] + 3;
+ a[i * 8 + 3] = b[i * 8 + 4] + 4;
+ a[i * 8 + 4] = b[i * 8 + 3] + 5;
+ a[i * 8 + 5] = b[i * 8 + 2] + 6;
+ a[i * 8 + 6] = b[i * 8 + 1] + 7;
+ a[i * 8 + 7] = b[i * 8 + 0] + 8;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b)
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ a[i * 8] = b[i * 8 + 3] + 1;
+ a[i * 8 + 1] = b[i * 8 + 2] + 2;
+ a[i * 8 + 2] = b[i * 8 + 1] + 3;
+ a[i * 8 + 3] = b[i * 8 + 0] + 4;
+ a[i * 8 + 4] = b[i * 8 + 7] + 5;
+ a[i * 8 + 5] = b[i * 8 + 6] + 6;
+ a[i * 8 + 6] = b[i * 8 + 5] + 7;
+ a[i * 8 + 7] = b[i * 8 + 4] + 8;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s\n} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b)
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ a[i * 8] = b[i * 8 + 1] + 1;
+ a[i * 8 + 1] = b[i * 8 + 0] + 2;
+ a[i * 8 + 2] = b[i * 8 + 3] + 3;
+ a[i * 8 + 3] = b[i * 8 + 2] + 4;
+ a[i * 8 + 4] = b[i * 8 + 5] + 5;
+ a[i * 8 + 5] = b[i * 8 + 4] + 6;
+ a[i * 8 + 6] = b[i * 8 + 7] + 7;
+ a[i * 8 + 7] = b[i * 8 + 6] + 8;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\trevb\tz[0-9]+\.h, p[0-7]/m, z[0-9]+\.h\n} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b, uint8_t *restrict c)
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ a[i * 8] = b[i * 8] + c[i * 8];
+ a[i * 8 + 1] = b[i * 8] + c[i * 8 + 1];
+ a[i * 8 + 2] = b[i * 8 + 2] + c[i * 8 + 2];
+ a[i * 8 + 3] = b[i * 8 + 2] + c[i * 8 + 3];
+ a[i * 8 + 4] = b[i * 8 + 4] + c[i * 8 + 4];
+ a[i * 8 + 5] = b[i * 8 + 4] + c[i * 8 + 5];
+ a[i * 8 + 6] = b[i * 8 + 6] + c[i * 8 + 6];
+ a[i * 8 + 7] = b[i * 8 + 6] + c[i * 8 + 7];
+ }
+}
+
+/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b,
+ uint8_t *restrict c, uint8_t *restrict d)
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ a[i * 8] = c[i * 8] + d[i * 8];
+ a[i * 8 + 1] = c[i * 8] + d[i * 8 + 1];
+ a[i * 8 + 2] = c[i * 8 + 2] + d[i * 8 + 2];
+ a[i * 8 + 3] = c[i * 8 + 2] + d[i * 8 + 3];
+ a[i * 8 + 4] = c[i * 8 + 4] + d[i * 8 + 4];
+ a[i * 8 + 5] = c[i * 8 + 4] + d[i * 8 + 5];
+ a[i * 8 + 6] = c[i * 8 + 6] + d[i * 8 + 6];
+ a[i * 8 + 7] = c[i * 8 + 6] + d[i * 8 + 7];
+ b[i * 8] = c[i * 8 + 1] + d[i * 8];
+ b[i * 8 + 1] = c[i * 8 + 1] + d[i * 8 + 1];
+ b[i * 8 + 2] = c[i * 8 + 3] + d[i * 8 + 2];
+ b[i * 8 + 3] = c[i * 8 + 3] + d[i * 8 + 3];
+ b[i * 8 + 4] = c[i * 8 + 5] + d[i * 8 + 4];
+ b[i * 8 + 5] = c[i * 8 + 5] + d[i * 8 + 5];
+ b[i * 8 + 6] = c[i * 8 + 7] + d[i * 8 + 6];
+ b[i * 8 + 7] = c[i * 8 + 7] + d[i * 8 + 7];
+ }
+}
+
+/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
+/* { dg-final { scan-assembler {\ttrn2\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b)
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ a[i * 8] = b[i * 8 + 3] + 1;
+ a[i * 8 + 1] = b[i * 8 + 6] + 1;
+ a[i * 8 + 2] = b[i * 8 + 0] + 1;
+ a[i * 8 + 3] = b[i * 8 + 2] + 1;
+ a[i * 8 + 4] = b[i * 8 + 1] + 1;
+ a[i * 8 + 5] = b[i * 8 + 7] + 1;
+ a[i * 8 + 6] = b[i * 8 + 5] + 1;
+ a[i * 8 + 7] = b[i * 8 + 4] + 1;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+void
+f (uint8_t *restrict a, uint8_t *restrict b)
+{
+ for (int i = 0; i < 100; ++i)
+ {
+ a[i * 8] = b[i * 8 + 1] + 1;
+ a[i * 8 + 1] = b[i * 8 + 7] + 2;
+ a[i * 8 + 2] = b[i * 8 + 1] + 3;
+ a[i * 8 + 3] = b[i * 8 + 7] + 4;
+ a[i * 8 + 4] = b[i * 8 + 1] + 5;
+ a[i * 8 + 5] = b[i * 8 + 7] + 6;
+ a[i * 8 + 6] = b[i * 8 + 1] + 7;
+ a[i * 8 + 7] = b[i * 8 + 7] + 8;
+ }
+}
+
+/* { dg-final { scan-assembler {\ttbl\tz[0-9]+\.b, z[0-9]+\.b, z[0-9]+\.b\n} } } */
{
stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
vec_info *vinfo = stmt_info->vinfo;
- tree mask_element_type = NULL_TREE, mask_type;
int vec_index = 0;
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+ unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
unsigned int mask_element;
machine_mode mode;
- unsigned HOST_WIDE_INT nunits, const_vf;
if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
return false;
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
mode = TYPE_MODE (vectype);
-
- /* At the moment, all permutations are represented using per-element
- indices, so we can't cope with variable vector lengths or
- vectorization factors. */
- if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
- || !vf.is_constant (&const_vf))
- return false;
-
- /* The generic VEC_PERM_EXPR code always uses an integral type of the
- same size as the vector element being permuted. */
- mask_element_type = lang_hooks.types.type_for_mode
- (int_mode_for_mode (TYPE_MODE (TREE_TYPE (vectype))).require (), 1);
- mask_type = get_vectype_for_scalar_type (mask_element_type);
- vec_perm_builder mask (nunits, nunits, 1);
- mask.quick_grow (nunits);
- vec_perm_indices indices;
+ poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
/* Initialize the vect stmts of NODE to properly insert the generated
stmts later. */
bool noop_p = true;
*n_perms = 0;
- for (unsigned int j = 0; j < const_vf; j++)
+ vec_perm_builder mask;
+ unsigned int nelts_to_build;
+ unsigned int nvectors_per_build;
+ bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
+ && multiple_p (nunits, group_size));
+ if (repeating_p)
{
- for (int k = 0; k < group_size; k++)
+ /* A single vector contains a whole number of copies of the node, so:
+ (a) all permutes can use the same mask; and
+ (b) the permutes only need a single vector input. */
+ mask.new_vector (nunits, group_size, 3);
+ nelts_to_build = mask.encoded_nelts ();
+ nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
+ }
+ else
+ {
+ /* We need to construct a separate mask for each vector statement. */
+ unsigned HOST_WIDE_INT const_nunits, const_vf;
+ if (!nunits.is_constant (&const_nunits)
+ || !vf.is_constant (&const_vf))
+ return false;
+ mask.new_vector (const_nunits, const_nunits, 1);
+ nelts_to_build = const_vf * group_size;
+ nvectors_per_build = 1;
+ }
+
+ unsigned int count = mask.encoded_nelts ();
+ mask.quick_grow (count);
+ vec_perm_indices indices;
+
+ for (unsigned int j = 0; j < nelts_to_build; j++)
+ {
+ unsigned int iter_num = j / group_size;
+ unsigned int stmt_num = j % group_size;
+ unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
+ + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
+ if (repeating_p)
{
- unsigned int i = (SLP_TREE_LOAD_PERMUTATION (node)[k]
- + j * DR_GROUP_SIZE (stmt_info));
- vec_index = i / nunits;
- mask_element = i % nunits;
+ first_vec_index = 0;
+ mask_element = i;
+ }
+ else
+ {
+ /* Enforced before the loop when !repeating_p. */
+ unsigned int const_nunits = nunits.to_constant ();
+ vec_index = i / const_nunits;
+ mask_element = i % const_nunits;
if (vec_index == first_vec_index
|| first_vec_index == -1)
{
|| second_vec_index == -1)
{
second_vec_index = vec_index;
- mask_element += nunits;
+ mask_element += const_nunits;
}
else
{
return false;
}
- gcc_assert (mask_element < 2 * nunits);
- if (mask_element != index)
- noop_p = false;
- mask[index++] = mask_element;
+ gcc_assert (mask_element < 2 * const_nunits);
+ }
+
+ if (mask_element != index)
+ noop_p = false;
+ mask[index++] = mask_element;
- if (index == nunits && !noop_p)
+ if (index == count && !noop_p)
+ {
+ indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
+ if (!can_vec_perm_const_p (mode, indices))
{
- indices.new_vector (mask, 2, nunits);
- if (!can_vec_perm_const_p (mode, indices))
+ if (dump_enabled_p ())
{
- if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+ vect_location,
+ "unsupported vect permute { ");
+ for (i = 0; i < count; ++i)
{
- dump_printf_loc (MSG_MISSED_OPTIMIZATION,
- vect_location,
- "unsupported vect permute { ");
- for (i = 0; i < nunits; ++i)
- {
- dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
- dump_printf (MSG_MISSED_OPTIMIZATION, " ");
- }
- dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
+ dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
+ dump_printf (MSG_MISSED_OPTIMIZATION, " ");
}
- gcc_assert (analyze_only);
- return false;
+ dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
}
-
- ++*n_perms;
+ gcc_assert (analyze_only);
+ return false;
}
- if (index == nunits)
+ ++*n_perms;
+ }
+
+ if (index == count)
+ {
+ if (!analyze_only)
{
- if (!analyze_only)
- {
- tree mask_vec = NULL_TREE;
+ tree mask_vec = NULL_TREE;
- if (! noop_p)
- mask_vec = vec_perm_indices_to_tree (mask_type, indices);
+ if (! noop_p)
+ mask_vec = vect_gen_perm_mask_checked (vectype, indices);
- if (second_vec_index == -1)
- second_vec_index = first_vec_index;
+ if (second_vec_index == -1)
+ second_vec_index = first_vec_index;
+ for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
+ {
/* Generate the permute statement if necessary. */
- tree first_vec = dr_chain[first_vec_index];
- tree second_vec = dr_chain[second_vec_index];
+ tree first_vec = dr_chain[first_vec_index + ri];
+ tree second_vec = dr_chain[second_vec_index + ri];
stmt_vec_info perm_stmt_info;
if (! noop_p)
{
SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++]
= perm_stmt_info;
}
-
- index = 0;
- first_vec_index = -1;
- second_vec_index = -1;
- noop_p = true;
}
+
+ index = 0;
+ first_vec_index = -1;
+ second_vec_index = -1;
+ noop_p = true;
}
}
if (slp)
{
grouped_load = false;
- /* For SLP permutation support we need to load the whole group,
- not only the number of vector stmts the permutation result
- fits in. */
- if (slp_perm)
+ /* If an SLP permutation is from N elements to N elements,
+ and if one vector holds a whole number of N, we can load
+ the inputs to the permutation in the same way as an
+ unpermuted sequence. In other cases we need to load the
+ whole group, not only the number of vector stmts the
+ permutation result fits in. */
+ if (slp_perm
+ && (group_size != SLP_INSTANCE_GROUP_SIZE (slp_node_instance)
+ || !multiple_p (nunits, group_size)))
{
- /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
- variable VF. */
+ /* We don't yet generate such SLP_TREE_LOAD_PERMUTATIONs for
+ variable VF; see vect_transform_slp_perm_load. */
unsigned int const_vf = vf.to_constant ();
unsigned int const_nunits = nunits.to_constant ();
vec_num = CEIL (group_size * const_vf, const_nunits);