}
/* Copies copy of LOOP as subloop of TARGET loop, placing newly
- created loop into loops structure. */
+ created loop into loops structure. If AFTER is non-null
+ the new loop is added at AFTER->next, otherwise in front of TARGETs
+ sibling list. */
struct loop *
-duplicate_loop (struct loop *loop, struct loop *target)
+duplicate_loop (struct loop *loop, struct loop *target, struct loop *after)
{
struct loop *cloop;
cloop = alloc_loop ();
set_loop_copy (loop, cloop);
/* Add it to target. */
- flow_loop_tree_node_add (target, cloop);
+ flow_loop_tree_node_add (target, cloop, after);
return cloop;
}
/* Copies structure of subloops of LOOP into TARGET loop, placing
- newly created loops into loop tree. */
+ newly created loops into loop tree at the end of TARGETs sibling
+ list in the original order. */
void
duplicate_subloops (struct loop *loop, struct loop *target)
{
- struct loop *aloop, *cloop;
+ struct loop *aloop, *cloop, *tail;
+ for (tail = target->inner; tail && tail->next; tail = tail->next)
+ ;
for (aloop = loop->inner; aloop; aloop = aloop->next)
{
- cloop = duplicate_loop (aloop, target);
+ cloop = duplicate_loop (aloop, target, tail);
+ tail = cloop;
+ gcc_assert(!tail->next);
duplicate_subloops (aloop, cloop);
}
}
/* Copies structure of subloops of N loops, stored in array COPIED_LOOPS,
- into TARGET loop, placing newly created loops into loop tree. */
+ into TARGET loop, placing newly created loops into loop tree adding
+ them to TARGETs sibling list at the end in order. */
static void
copy_loops_to (struct loop **copied_loops, int n, struct loop *target)
{
- struct loop *aloop;
+ struct loop *aloop, *tail;
int i;
+ for (tail = target->inner; tail && tail->next; tail = tail->next)
+ ;
for (i = 0; i < n; i++)
{
- aloop = duplicate_loop (copied_loops[i], target);
+ aloop = duplicate_loop (copied_loops[i], target, tail);
+ tail = aloop;
+ gcc_assert(!tail->next);
duplicate_subloops (copied_loops[i], aloop);
}
}
}
/* Duplicates body of LOOP to given edge E NDUPL times. Takes care of updating
- loop structure and dominators. E's destination must be LOOP header for
- this to work, i.e. it must be entry or latch edge of this loop; these are
- unique, as the loops must have preheaders for this function to work
- correctly (in case E is latch, the function unrolls the loop, if E is entry
- edge, it peels the loop). Store edges created by copying ORIG edge from
- copies corresponding to set bits in WONT_EXIT bitmap (bit 0 corresponds to
- original LOOP body, the other copies are numbered in order given by control
- flow through them) into TO_REMOVE array. Returns false if duplication is
+ loop structure and dominators (order of inner subloops is retained).
+ E's destination must be LOOP header for this to work, i.e. it must be entry
+ or latch edge of this loop; these are unique, as the loops must have
+ preheaders for this function to work correctly (in case E is latch, the
+ function unrolls the loop, if E is entry edge, it peels the loop). Store
+ edges created by copying ORIG edge from copies corresponding to set bits in
+ WONT_EXIT bitmap (bit 0 corresponds to original LOOP body, the other copies
+ are numbered in order given by control flow through them) into TO_REMOVE
+ array. Returns false if duplication is
impossible. */
bool
--- /dev/null
+/* Loop unroll-and-jam.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 3, or (at your option) any
+later version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "params.h"
+#include "tree-pass.h"
+#include "backend.h"
+#include "tree.h"
+#include "gimple.h"
+#include "ssa.h"
+#include "fold-const.h"
+#include "tree-cfg.h"
+#include "tree-ssa.h"
+#include "tree-ssa-loop-niter.h"
+#include "tree-ssa-loop.h"
+#include "tree-ssa-loop-manip.h"
+#include "cfgloop.h"
+#include "tree-scalar-evolution.h"
+#include "gimple-iterator.h"
+#include "cfghooks.h"
+#include "tree-data-ref.h"
+#include "tree-ssa-loop-ivopts.h"
+#include "tree-vectorizer.h"
+
+/* Unroll and Jam transformation
+
+ This is a combination of two transformations, where the second
+ is not always valid. It's applicable if a loop nest has redundancies
+ over the iterations of an outer loop while not having that with
+ an inner loop.
+
+ Given this nest:
+ for (i) {
+ for (j) {
+ B(i,j)
+ }
+ }
+
+ first unroll:
+ for (i by 2) {
+ for (j) {
+ B(i,j)
+ }
+ for (j) {
+ B(i+1,j)
+ }
+ }
+
+ then fuse the two adjacent inner loops resulting from that:
+ for (i by 2) {
+ for (j) {
+ B(i,j)
+ B(i+1,j)
+ }
+ }
+
+ As the order of evaluations of the body B changes this is valid
+ only in certain situations: all distance vectors need to be forward.
+ Additionally if there are multiple induction variables than just
+ a counting control IV (j above) we can also deal with some situations.
+
+ The validity is checked by unroll_jam_possible_p, and the data-dep
+ testing below.
+
+ A trivial example where the fusion is wrong would be when
+ B(i,j) == x[j-1] = x[j];
+ for (i by 2) {
+ for (j) {
+ x[j-1] = x[j];
+ }
+ for (j) {
+ x[j-1] = x[j];
+ }
+ } effect: move content to front by two elements
+ -->
+ for (i by 2) {
+ for (j) {
+ x[j-1] = x[j];
+ x[j-1] = x[j];
+ }
+ } effect: move content to front by one element
+*/
+
+/* Modify the loop tree for the fact that all code once belonging
+ to the OLD loop or the outer loop of OLD now is inside LOOP. */
+
+static void
+merge_loop_tree (struct loop *loop, struct loop *old)
+{
+ basic_block *bbs;
+ int i, n;
+ struct loop *subloop;
+ edge e;
+ edge_iterator ei;
+
+ /* Find its nodes. */
+ bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
+ n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
+
+ for (i = 0; i < n; i++)
+ {
+ /* If the block was direct child of OLD loop it's now part
+ of LOOP. If it was outside OLD, then it moved into LOOP
+ as well. This avoids changing the loop father for BBs
+ in inner loops of OLD. */
+ if (bbs[i]->loop_father == old
+ || loop_depth (bbs[i]->loop_father) < loop_depth (old))
+ {
+ remove_bb_from_loops (bbs[i]);
+ add_bb_to_loop (bbs[i], loop);
+ continue;
+ }
+
+ /* If we find a direct subloop of OLD, move it to LOOP. */
+ subloop = bbs[i]->loop_father;
+ if (loop_outer (subloop) == old && subloop->header == bbs[i])
+ {
+ flow_loop_tree_node_remove (subloop);
+ flow_loop_tree_node_add (loop, subloop);
+ }
+ }
+
+ /* Update the information about loop exit edges. */
+ for (i = 0; i < n; i++)
+ {
+ FOR_EACH_EDGE (e, ei, bbs[i]->succs)
+ {
+ rescan_loop_exit (e, false, false);
+ }
+ }
+
+ loop->num_nodes = n;
+
+ free (bbs);
+}
+
+/* BB exits the outer loop of an unroll-and-jam situation.
+ Check if any statements therein would prevent the transformation. */
+
+static bool
+bb_prevents_fusion_p (basic_block bb)
+{
+ gimple_stmt_iterator gsi;
+ /* BB is duplicated by outer unrolling and then all N-1 first copies
+ move into the body of the fused inner loop. The last copy remains
+ the exit block of the outer loop and is still outside the inner loop
+ also after fusion. We can't allow this for some effects of BB:
+ * stores or unknown side-effects prevent fusion
+ * loads don't
+ * computations into SSA names: these aren't problematic. Their
+ result will be unused on the exit edges of the first N-1 copies
+ (those aren't taken after unrolling). If they are used on the
+ other edge (the one leading to the outer latch block) they are
+ loop-carried (on the outer loop) and the Nth copy of BB will
+ compute them again (i.e. the first N-1 copies will be dead). */
+ for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gimple *g = gsi_stmt (gsi);
+ if (gimple_vdef (g) || gimple_has_side_effects (g))
+ return true;
+ }
+ return false;
+}
+
+/* Given an inner loop LOOP (of some OUTER loop) determine if
+ we can safely fuse copies of it (generated by outer unrolling).
+ If so return true, otherwise return false. */
+
+static bool
+unroll_jam_possible_p (struct loop *outer, struct loop *loop)
+{
+ basic_block *bbs;
+ int i, n;
+ struct tree_niter_desc niter;
+
+ /* When fusing the loops we skip the latch block
+ of the first one, so it mustn't have any effects to
+ preserve. */
+ if (!empty_block_p (loop->latch))
+ return false;
+
+ if (!single_exit (loop))
+ return false;
+
+ /* We need a perfect nest. Quick check for adjacent inner loops. */
+ if (outer->inner != loop || loop->next)
+ return false;
+
+ /* The number of iterations of the inner loop must be loop invariant
+ with respect to the outer loop. */
+ if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
+ false, true)
+ || niter.cmp == ERROR_MARK
+ || !integer_zerop (niter.may_be_zero)
+ || !expr_invariant_in_loop_p (outer, niter.niter))
+ return false;
+
+ /* And check blocks belonging to just outer loop. */
+ bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
+ n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
+
+ for (i = 0; i < n; i++)
+ {
+ if (bbs[i]->loop_father == outer
+ && bbs[i] != outer->latch && bbs[i] != outer->header
+ && (!loop_exits_from_bb_p (outer, bbs[i])
+ || bb_prevents_fusion_p (bbs[i])))
+ break;
+ /* XXX Note that the above disallows head-controlled inner loops,
+ that we usually have. The guard block would need to be accepted
+ (invariant condition either entering or skipping the loop),
+ without also accepting arbitrary control flow. When unswitching
+ ran before us (as with -O3) this won't be a problem because its
+ outer loop unswitching will have moved out the invariant condition.
+
+ If we do that we need to extend fuse_loops() to cope with this
+ by threading through the (still invariant) copied condition
+ between the two loop copies. */
+ }
+ free (bbs);
+ if (i != n)
+ return false;
+
+ /* For now we can safely fuse copies of LOOP only if all
+ loop carried variables are inductions (or the virtual op).
+
+ We could handle reductions as well (the initial value in the second
+ body would be the after-iter value of the first body) if it's over
+ an associative and commutative operation. We wouldn't
+ be able to handle unknown cycles. */
+ gphi_iterator psi;
+ for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
+ {
+ affine_iv iv;
+ tree op = gimple_phi_result (psi.phi ());
+
+ if (virtual_operand_p (op))
+ continue;
+ if (!simple_iv (loop, loop, op, &iv, true))
+ return false;
+ /* The inductions must be regular, loop invariant step and initial
+ value. */
+ if (!expr_invariant_in_loop_p (outer, iv.step)
+ || !expr_invariant_in_loop_p (outer, iv.base))
+ return false;
+ /* XXX With more effort we could also be able to deal with inductions
+ where the initial value is loop variant but a simple IV in the
+ outer loop. The initial value for the second body would be
+ the original initial value plus iv.base.step. The next value
+ for the fused loop would be the original next value of the first
+ copy, _not_ the next value of the second body. */
+ }
+
+ return true;
+}
+
+/* Fuse LOOP with all further neighbors. The loops are expected to
+ be in appropriate form. */
+
+static void
+fuse_loops (struct loop *loop)
+{
+ struct loop *next = loop->next;
+
+ while (next)
+ {
+ edge e;
+
+ remove_branch (single_pred_edge (loop->latch));
+ /* Make delete_basic_block not fiddle with the loop structure. */
+ basic_block oldlatch = loop->latch;
+ loop->latch = NULL;
+ delete_basic_block (oldlatch);
+ e = redirect_edge_and_branch (loop_latch_edge (next),
+ loop->header);
+ loop->latch = e->src;
+ flush_pending_stmts (e);
+
+ gcc_assert (EDGE_COUNT (next->header->preds) == 1);
+
+ /* The PHI nodes of the second body (single-argument now)
+ need adjustments to use the right values: either directly
+ the value of the corresponding PHI in the first copy or
+ the one leaving the first body which unrolling did for us.
+
+ See also unroll_jam_possible_p() for further possibilities. */
+ gphi_iterator psi_first, psi_second;
+ e = single_pred_edge (next->header);
+ for (psi_first = gsi_start_phis (loop->header),
+ psi_second = gsi_start_phis (next->header);
+ !gsi_end_p (psi_first);
+ gsi_next (&psi_first), gsi_next (&psi_second))
+ {
+ gphi *phi_first = psi_first.phi ();
+ gphi *phi_second = psi_second.phi ();
+ tree firstop = gimple_phi_result (phi_first);
+ /* The virtual operand is correct already as it's
+ always live at exit, hence has a LCSSA node and outer
+ loop unrolling updated SSA form. */
+ if (virtual_operand_p (firstop))
+ continue;
+
+ /* Due to unroll_jam_possible_p() we know that this is
+ an induction. The second body goes over the same
+ iteration space. */
+ add_phi_arg (phi_second, firstop, e,
+ gimple_location (phi_first));
+ }
+ gcc_assert (gsi_end_p (psi_second));
+
+ merge_loop_tree (loop, next);
+ gcc_assert (!next->num_nodes);
+ struct loop *ln = next->next;
+ delete_loop (next);
+ next = ln;
+ }
+ rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
+}
+
+/* Returns true if the distance in DDR can be determined and adjusts
+ the unroll factor in *UNROLL to make unrolling valid for that distance.
+ Otherwise return false.
+
+ If this data dep can lead to a removed memory reference, increment
+ *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
+ for this to happen. */
+
+static bool
+adjust_unroll_factor (struct data_dependence_relation *ddr,
+ unsigned *unroll, unsigned *profit_unroll,
+ unsigned *removed)
+{
+ bool ret = false;
+ if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
+ {
+ if (DDR_NUM_DIST_VECTS (ddr) == 0)
+ return false;
+ unsigned i;
+ lambda_vector dist_v;
+ FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
+ {
+ /* A distance (a,b) is at worst transformed into (a/N,b) by the
+ unrolling (factor N), so the transformation is valid if
+ a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll
+ factor needs to be limited so that the first condition holds.
+ That may limit the factor down to zero in the worst case. */
+ int dist = dist_v[0];
+ if (dist < 0)
+ gcc_unreachable ();
+ else if ((unsigned)dist >= *unroll)
+ ;
+ else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
+ || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
+ && dist > 0))
+ ;
+ else
+ *unroll = dist;
+
+ /* With a distance (a,0) it's always profitable to unroll-and-jam
+ (by a+1), because one memory reference will go away. With
+ (a,b) and b != 0 that's less clear. We will increase the
+ number of streams without lowering the number of mem refs.
+ So for now only handle the first situation. */
+ if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
+ {
+ *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
+ (*removed)++;
+ }
+
+ ret = true;
+ }
+ }
+ return ret;
+}
+
+/* Main entry point for the unroll-and-jam transformation
+ described above. */
+
+static unsigned int
+tree_loop_unroll_and_jam (void)
+{
+ struct loop *loop;
+ bool changed = false;
+
+ gcc_assert (scev_initialized_p ());
+
+ /* Go through all innermost loops. */
+ FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
+ {
+ struct loop *outer = loop_outer (loop);
+
+ if (loop_depth (loop) < 2
+ || optimize_loop_nest_for_size_p (outer))
+ continue;
+
+ if (!unroll_jam_possible_p (outer, loop))
+ continue;
+
+ vec<data_reference_p> datarefs;
+ vec<ddr_p> dependences;
+ unsigned unroll_factor, profit_unroll, removed;
+ struct tree_niter_desc desc;
+ bool unroll = false;
+
+ auto_vec<loop_p, 3> loop_nest;
+ dependences.create (10);
+ datarefs.create (10);
+ if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
+ &datarefs, &dependences))
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Cannot analyze data dependencies\n");
+ free_data_refs (datarefs);
+ free_dependence_relations (dependences);
+ return false;
+ }
+ if (!datarefs.length ())
+ continue;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ dump_data_dependence_relations (dump_file, dependences);
+
+ unroll_factor = (unsigned)-1;
+ profit_unroll = 1;
+ removed = 0;
+
+ /* Check all dependencies. */
+ unsigned i;
+ struct data_dependence_relation *ddr;
+ FOR_EACH_VEC_ELT (dependences, i, ddr)
+ {
+ struct data_reference *dra, *drb;
+
+ /* If the refs are independend there's nothing to do. */
+ if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
+ continue;
+ dra = DDR_A (ddr);
+ drb = DDR_B (ddr);
+ /* Nothing interesting for the self dependencies. */
+ if (dra == drb)
+ continue;
+
+ /* Now check the distance vector, for determining a sensible
+ outer unroll factor, and for validity of merging the inner
+ loop copies. */
+ if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
+ &removed))
+ {
+ /* Couldn't get the distance vector. For two reads that's
+ harmless (we assume we should unroll). For at least
+ one write this means we can't check the dependence direction
+ and hence can't determine safety. */
+
+ if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
+ {
+ unroll_factor = 0;
+ break;
+ }
+ }
+ }
+
+ /* We regard a user-specified minimum percentage of zero as a request
+ to ignore all profitability concerns and apply the transformation
+ always. */
+ if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
+ profit_unroll = 2;
+ else if (removed * 100 / datarefs.length ()
+ < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
+ profit_unroll = 1;
+ if (unroll_factor > profit_unroll)
+ unroll_factor = profit_unroll;
+ if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
+ unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
+ unroll = (unroll_factor > 1
+ && can_unroll_loop_p (outer, unroll_factor, &desc));
+
+ if (unroll)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
+ find_loop_location (outer),
+ "applying unroll and jam with factor %d\n",
+ unroll_factor);
+ initialize_original_copy_tables ();
+ tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
+ &desc);
+ free_original_copy_tables ();
+ fuse_loops (outer->inner);
+ changed = true;
+ }
+
+ loop_nest.release ();
+ free_dependence_relations (dependences);
+ free_data_refs (datarefs);
+ }
+
+ if (changed)
+ {
+ scev_reset ();
+ free_dominance_info (CDI_DOMINATORS);
+ return TODO_cleanup_cfg;
+ }
+ return 0;
+}
+
+/* Pass boilerplate */
+
+namespace {
+
+const pass_data pass_data_loop_jam =
+{
+ GIMPLE_PASS, /* type */
+ "unrolljam", /* name */
+ OPTGROUP_LOOP, /* optinfo_flags */
+ TV_LOOP_JAM, /* tv_id */
+ PROP_cfg, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_loop_jam : public gimple_opt_pass
+{
+public:
+ pass_loop_jam (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_loop_jam, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *) { return flag_unroll_jam != 0; }
+ virtual unsigned int execute (function *);
+
+};
+
+unsigned int
+pass_loop_jam::execute (function *fun)
+{
+ if (number_of_loops (fun) <= 1)
+ return 0;
+
+ return tree_loop_unroll_and_jam ();
+}
+
+}
+
+gimple_opt_pass *
+make_pass_loop_jam (gcc::context *ctxt)
+{
+ return new pass_loop_jam (ctxt);
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -funroll-and-jam --param unroll-jam-min-percent=0 -fdump-tree-unrolljam-details" } */
+/* { dg-require-effective-target int32plus } */
+
+#include <stdio.h>
+extern unsigned int a[];
+extern unsigned int b[];
+extern unsigned int aa[][1024];
+unsigned int checksum;
+void checkaa(void)
+{
+ unsigned sum = 1;
+ unsigned long i, j;
+ for (i = 0; i < 1024; i++) {
+ for (j = 0; j < 16; j++) {
+ sum += aa[j][i]*31+47;
+ }
+ }
+ checksum = checksum * 27 + sum;
+ //printf(" %d\n", sum);
+}
+
+void checkb(void)
+{
+ unsigned sum = 1;
+ unsigned long i, j;
+ for (i = 0; i < 1024; i++) {
+ sum += b[i]*31+47;
+ }
+ checksum = checksum * 27 + sum;
+ //printf(" %d\n", sum);
+}
+
+#define TEST(name, body, test) \
+static void __attribute__((noinline,noclone)) name (unsigned long n, unsigned long m) \
+{ \
+ unsigned long i, j; \
+ for (i = 1; i < m; i++) { \
+ for (j = 1; j < n; j++) { \
+ body; \
+ } \
+ } \
+ test; \
+} \
+static void __attribute__((noinline,noclone,optimize("O1"))) name ## noopt (unsigned long n, unsigned long m) \
+{ \
+ unsigned long i, j; \
+ for (i = 1; i < m; i++) { \
+ for (j = 1; j < n; j++) { \
+ body; \
+ } \
+ } \
+ test; \
+}
+TEST(foo1, aa[i+1][j+1]=aa[i][j] * aa[i][j] / 2, checkaa()) //ok, -1,-1
+TEST(foo2, aa[i][j+1]=3*aa[i+1][j], checkaa()) //notok, 1,-1
+TEST(foo3, aa[i+1][j-1]=aa[i][j] * aa[i][j] / 2, checkaa()) //notok, -1,1
+TEST(foo4, aa[i][j] = aa[i-1][j+1] * aa[i-1][j+1] / 2, checkaa()) //notok, -1,1
+TEST(foo5, aa[i][j] = aa[i+1][j+1] * aa[i+1][j+1] / 2, checkaa()) //ok, 1,1
+TEST(foo6, aa[i][j] = aa[i+1][j] * aa[i+1][j] / 2, checkaa()) //ok, -1,0
+TEST(foo7, aa[i+1][j] = aa[i][j] * aa[i][j] / 2, checkaa()) //ok, 1,0
+TEST(foo9, b[j] = 3*b[j+1] + 1, checkb()) //notok, 0,-1
+TEST(foo10, b[j] = 3*b[j] + 1, checkb()) //ok, 0,0
+
+/* foo8 should work as well, but currently doesn't because the distance
+ vectors we compute are too pessimistic. We compute
+ (0,1), (1,1) and (1,-1)
+ and the last one causes us to lose. */
+TEST(foo8, b[j+1] = 3*b[j] + 1, checkb()) //ok, 0,1
+
+unsigned int a[1024];
+unsigned int b[1024];
+unsigned int aa[16][1024];
+void init(void)
+{
+ unsigned long i,j;
+ for (i = 0; i < 1024; i++) {
+ for (j = 0; j < 16; j++) {
+ aa[j][i] = ((j+1)*2+i+1) % 17;
+ }
+ a[i] = ((i+1)*31) % 19;
+ b[i] = ((i+1)*47) % 23;
+ }
+ checksum = 1;
+}
+
+#define RUN(name) \
+ printf(" %s\n", #name); \
+ init();for(i=0;i<4;i++)name##noopt(32,8); checka = checksum; \
+ init();for(i=0;i<4;i++)name(32,8); \
+ printf("%sok %s\n", checka != checksum ? "NOT " : "", #name);
+
+int main()
+{
+ int i;
+ unsigned checka;
+ RUN(foo1);
+ RUN(foo2);
+ RUN(foo3);
+ RUN(foo4);
+ RUN(foo5);
+ RUN(foo6);
+ RUN(foo7);
+ RUN(foo8);
+ RUN(foo9);
+ RUN(foo10);
+ return 0;
+}
+
+/* Five loops should be unroll-jammed (actually six, but see above). */
+/* { dg-final { scan-tree-dump-times "applying unroll and jam" 5 "unrolljam" } } */