/* Kind of distributed loop. */
enum partition_kind {
- PKIND_NORMAL, PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
+ PKIND_NORMAL,
+ /* Partial memset stands for a partition that can be distributed into a
+ loop of memset calls, rather than a single memset call. It's handled
+ just like a normal partition, i.e., distributed as a separate loop; no
+ memset call is generated.
+
+ Note: This is a hack that tries to distribute zeroing statements in a
+ loop nest as deeply as possible. As a result, parloop achieves better
+ parallelization by parallelizing a deeper loop nest. This hack should
+ become unnecessary and be removed once distributed memset can be
+ understood and analyzed by data reference analysis. See PR82604. */
+ PKIND_PARTIAL_MEMSET,
+ PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
};
/* Type of distributed loop. */
static bool
partition_builtin_p (partition *partition)
{
- return partition->kind != PKIND_NORMAL;
+ return partition->kind > PKIND_PARTIAL_MEMSET;
}
/* Returns true if the partition contains a reduction. */
switch (partition->kind)
{
case PKIND_NORMAL:
+ case PKIND_PARTIAL_MEMSET:
/* Reductions all have to be in the last partition. */
gcc_assert (!partition_reduction_p (partition)
|| !copy_p);
/* Given data reference DR in LOOP_NEST, this function checks the enclosing
loops from inner to outer to see if loop's step equals to access size at
- each level of loop. Return true if yes; record access base and size in
- BASE and SIZE; save loop's step at each level of loop in STEPS if it is
- not null. For example:
+ each level of loop. Return 2 if we can prove this at every loop level;
+ record access base and size in BASE and SIZE; save loop's step at each
+ level of loop in STEPS if it is not null. For example:
int arr[100][100][100];
for (i = 0; i < 100; i++) ;steps[2] = 40000
for (j = 100; j > 0; j--) ;steps[1] = -400
for (k = 0; k < 100; k++) ;steps[0] = 4
- arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000. */
+ arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
-static bool
+ Return 1 if we can prove the equality at the innermost loop, but not at
+ every loop level. In this case, BASE and SIZE are not set.
+
+ Return 0 if the equality cannot be proven at any loop level. */
+
+static int
compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
tree *size, vec<tree> *steps = NULL)
{
tree ref = DR_REF (dr);
tree access_base = build_fold_addr_expr (ref);
tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
+ int res = 0;
do {
tree scev_fn = analyze_scalar_evolution (loop, access_base);
if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
- return false;
+ return res;
access_base = CHREC_LEFT (scev_fn);
if (tree_contains_chrecs (access_base, NULL))
- return false;
+ return res;
tree scev_step = CHREC_RIGHT (scev_fn);
/* Only support constant steps. */
if (TREE_CODE (scev_step) != INTEGER_CST)
- return false;
+ return res;
enum ev_direction access_dir = scev_direction (scev_fn);
if (access_dir == EV_DIR_UNKNOWN)
- return false;
+ return res;
if (steps != NULL)
steps->safe_push (scev_step);
/* At each level of loop, scev step must equal to access size. In other
words, DR must access consecutive memory between loop iterations. */
if (!operand_equal_p (scev_step, access_size, 0))
- return false;
+ return res;
+
+ /* Access stride can be computed for data reference at least for the
+ innermost loop. */
+ res = 1;
/* Compute DR's execution times in loop. */
tree niters = number_of_latch_executions (loop);
*base = access_base;
*size = access_size;
- return true;
+ /* Access stride can be computed for data reference at each level loop. */
+ return 2;
}
/* Allocate and return builtin struct. Record information like DST_DR,
&& flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
return;
- if (!compute_access_range (loop, dr, &base, &size))
+ int res = compute_access_range (loop, dr, &base, &size);
+ if (res == 0)
return;
+ if (res == 1)
+ {
+ partition->kind = PKIND_PARTIAL_MEMSET;
+ return;
+ }
poly_uint64 base_offset;
unsigned HOST_WIDE_INT const_base_offset;
tree base, size, src_base, src_size;
auto_vec<tree> dst_steps, src_steps;
- /* Compute access range of both load and store. They much have the same
- access size. */
- if (!compute_access_range (loop, dst_dr, &base, &size, &dst_steps)
- || !compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps)
- || !operand_equal_p (size, src_size, 0))
+ /* Compute access range of both load and store. */
+ int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
+ if (res != 2)
+ return;
+ res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
+ if (res != 2)
+ return;
+
+ /* They must have the same access size. */
+ if (!operand_equal_p (size, src_size, 0))
return;
/* Load and store in loop nest must access memory in the same way, i.e,
|| alias_ddrs->length () > 0)
return;
- unsigned num_builtin = 0, num_normal = 0;
+ unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
bool same_type_p = true;
enum partition_type type = ((*partitions)[0])->type;
for (i = 0; partitions->iterate (i, &partition); ++i)
{
same_type_p &= (type == partition->type);
- if (partition->kind != PKIND_NORMAL)
- num_builtin++;
- else
- num_normal++;
+ if (partition_builtin_p (partition))
+ {
+ num_builtin++;
+ continue;
+ }
+ num_normal++;
+ if (partition->kind == PKIND_PARTIAL_MEMSET)
+ num_partial_memset++;
}
/* Don't distribute current loop into too many loops given we don't have
memory stream cost model. Be even more conservative in case of loop
nest distribution. */
- if ((same_type_p && num_builtin == 0)
+ if ((same_type_p && num_builtin == 0
+ && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
|| (loop->inner != NULL
&& i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
|| (loop->inner == NULL
for (i = 0; partitions.iterate (i, &into); ++i)
{
bool changed = false;
- if (partition_builtin_p (into))
+ if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
continue;
for (int j = i + 1;
partitions.iterate (j, &partition); ++j)
struct loop *outer = loop_outer (loop);
tree niters = number_of_latch_executions (loop);
- /* TODO: We only support the innermost 2-level loop nest distribution
+ /* TODO: We only support the innermost 3-level loop nest distribution
because of compilation time issue for now. This should be relaxed
- in the future. */
- while (loop->inner == NULL
+ in the future. Note we only allow 3-level loop nest distribution
+ when parallelizing loops. */
+ while ((loop->inner == NULL
+ || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
&& loop_outer (outer)
&& outer->inner == loop && loop->next == NULL
&& single_exit (outer)