-2017-12-12 Jeff Law <law@redhat.com>
+2017-12-15 Alexandre Oliva <aoliva@redhat.com>
+
+ PR tree-optimization/81165
+ * tree-ssa-threadupdate.c (uses_in_bb): New.
+ (estimate_threading_killed_stmts): New.
+ * tree-ssa-threadupdate.h (estimate_threading_killed_stmts): Prototype.
+ * tree-ssa-threadedge.c
+ (record_temporary_equivalences_from_stmts_at_dest): Expand limit
+ when it's hit.
+
+2017-12-15 Jeff Law <law@redhat.com>
+
PR tree-optimization/83410
* tree-ssa-threadupdate.c (thread_block_1): Avoid certain jump
+2017-12-15 Alexandre Oliva <aoliva@redhat.com>
+
+ PR tree-optimization/81165
+ * gcc.dg/pr81165.c: New.
+
2017-12-15 Jakub Jelinek <jakub@redhat.com>
PR c++/83205
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not " \[/%\] " "optimized" } } */
+
+/* Testcase submitted for PR81165, with its main function removed as
+ it's turned into a compile test. We want to make sure that all of
+ the divide/remainder computations are removed by tree optimizers.
+
+ We can figure out that we don't need to compute at runtime even the
+ condition to enter the loop: the initial i==0 would have to be
+ greater than the sum of two small unsigned values: 1U>>t1 is in the
+ range 0..1, whereas the char value is bounded by the range 0..127,
+ being 128 % a positive number (zero would invoke undefined
+ behavior, so we can assume it doesn't happen). (We know it's
+ nonnegative because it's 10 times a number that has no more than
+ the bits for 16, 8 and 1 set.)
+
+ We don't realize that the loop is useless right away: jump
+ threading helps remove some of the complexity, particularly of the
+ computation within the loop: t1 is compared with 1, but it can
+ never be 1. (We could assume as much, since its being 1 would
+ divide by zero, but we don't.)
+
+ If we don't enter the conditional block, t1 remains at 2; if we do,
+ it's set to either 0 or -1, depending on the volatile read of x1.
+ If we jump thread at the end of the
+ conditional block, we can figure out the ranges exclude 1 and the
+ jump body is completely optimized out. However, we used to fail to
+ consider the block for jump threading due to the amount of
+ computation in it, without realizing most of it would die in
+ consequence of the threading.
+
+ We now take the dying code into account when deciding whether or
+ not to try jump threading. That might enable us to optimize the
+ function into { if (x2 != 0 || (x1 & 1) == 0) abort (); }. At the
+ time of this writing, with the patch, we get close, but the test on
+ x2 only gets as far as ((1 >> x2) == 0). Without the patch, some
+ of the loop remains. */
+
+/* Divisor seed for the loop condition below; file-scope so its value
+   is not a compile-time constant at -O3 without IPA help.  */
+short x0 = 15;
+
+void func (){
+ /* volatile reads keep x1/x2 opaque to the optimizers.  */
+ volatile int x1 = 1U;
+ volatile char x2 = 0;
+ char t0 = 0;
+ unsigned long t1 = 2LU;
+ int i = 0;
+
+ /* At runtime x2 is 0, so 1>>x2 is 1 and this block executes; the
+    optimizers cannot assume that, per the big comment above.  */
+ if(1>>x2) {
+ t0 = -1;
+ /* x1 reads as 1, so (1&(short)(1^8U))-1 == 0; in general the
+    result is 0 or -1, never 1 (see comment above).  */
+ t1 = (1&(short)(x1^8U))-1;
+ }
+
+ /* Loop condition mixes shifts, %, and / so the dump scan above can
+    verify all divide/remainder ops get optimized away.  */
+ while(i > (int)((1U>>t1)+(char)(128%(10*(25LU&(29%x0)))))) {
+ i += (int)(12L/(1!=(int)t1));
+ }
+
+ if (t0 != -1) __builtin_abort();
+ if (t1 != 0L) __builtin_abort();
+}
expansion, then do not thread through this block. */
stmt_count++;
if (stmt_count > max_stmt_count)
- return NULL;
+ {
+ /* If any of the stmts in the PATH's dests are going to be
+ killed due to threading, grow the max count
+ accordingly. */
+ if (max_stmt_count
+ == PARAM_VALUE (PARAM_MAX_JUMP_THREAD_DUPLICATION_STMTS))
+ {
+ max_stmt_count += estimate_threading_killed_stmts (e->dest);
+ if (dump_file)
+ fprintf (dump_file, "threading bb %i up to %i stmts\n",
+ e->dest->index, max_stmt_count);
+ }
+ /* If we're still past the limit, we're done. */
+ if (stmt_count > max_stmt_count)
+ return NULL;
+ }
 /* These are temporary ranges, do not reflect them back into
the global range data. */
paths.safe_push (path);
}
+
+/* Return how many non-debug uses of SSA name T there are within
+   basic block BB, as long as there aren't any uses outside BB.  If
+   there are any uses outside BB, return -1 if there's at most one
+   use within BB, or -2 if there is more than one use within BB.
+   Negative results thus mean "T stays live after threading"; the
+   -1/-2 distinction lets the caller skip caching counts that can
+   never reach zero.  */
+
+static int
+uses_in_bb (tree t, basic_block bb)
+{
+ int uses = 0;
+ bool outside_bb = false;
+
+ imm_use_iterator iter;
+ use_operand_p use_p;
+ FOR_EACH_IMM_USE_FAST (use_p, iter, t)
+ {
+ /* Debug stmts don't keep a definition alive, so skip them.  */
+ if (is_gimple_debug (USE_STMT (use_p)))
+ continue;
+
+ if (gimple_bb (USE_STMT (use_p)) != bb)
+ outside_bb = true;
+ else
+ uses++;
+
+ /* Once an outside use and more than one inside use are seen, the
+    answer is fixed at -2; stop scanning early.  */
+ if (outside_bb && uses > 1)
+ return -2;
+ }
+
+ if (outside_bb)
+ return -1;
+
+ return uses;
+}
+
+/* Starting from the final control flow stmt in BB, assuming it will
+   be removed, follow uses in to-be-removed stmts back to their defs
+   and count how many defs are to become dead and be removed as
+   well.  The returned estimate lets the caller raise the statement
+   limit for jump threading by the number of statements the
+   threading itself would eliminate.  */
+
+unsigned int
+estimate_threading_killed_stmts (basic_block bb)
+{
+ int killed_stmts = 0;
+ /* Maps an SSA name to the number of in-BB uses not yet accounted
+    for; see the caching logic below.  */
+ hash_map<tree, int> ssa_remaining_uses;
+ auto_vec<gimple *, 4> dead_worklist;
+
+ /* If the block has only two predecessors, threading leaves a single
+    incoming edge, so each PHI result collapses to the argument from
+    the surviving edge and the PHI itself dies; count those as dead
+    stmts.  */
+ bool drop_all_phis = EDGE_COUNT (bb->preds) == 2;
+
+ if (drop_all_phis)
+ for (gphi_iterator gsi = gsi_start_phis (bb);
+ !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gphi *phi = gsi.phi ();
+ tree dst = gimple_phi_result (phi);
+
+ /* We don't count virtual PHIs as stmts in
+ record_temporary_equivalences_from_phis.  */
+ if (virtual_operand_p (dst))
+ continue;
+
+ killed_stmts++;
+ }
+
+ /* An empty block has no control stmt to remove.  */
+ if (gsi_end_p (gsi_last_bb (bb)))
+ return killed_stmts;
+
+ gimple *stmt = gsi_stmt (gsi_last_bb (bb));
+ if (gimple_code (stmt) != GIMPLE_COND
+ && gimple_code (stmt) != GIMPLE_GOTO
+ && gimple_code (stmt) != GIMPLE_SWITCH)
+ return killed_stmts;
+
+ /* The control statement is always dead.  Seed the worklist with it
+    and walk use->def chains transitively within BB.  */
+ killed_stmts++;
+ dead_worklist.quick_push (stmt);
+ while (!dead_worklist.is_empty ())
+ {
+ stmt = dead_worklist.pop ();
+
+ ssa_op_iter iter;
+ use_operand_p use_p;
+ FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
+ {
+ tree t = USE_FROM_PTR (use_p);
+ gimple *def = SSA_NAME_DEF_STMT (t);
+
+ /* Only defs in this block can die; PHIs are handled above when
+    drop_all_phis, and side-effecting defs can never be removed.  */
+ if (gimple_bb (def) == bb
+ && (gimple_code (def) != GIMPLE_PHI
+ || !drop_all_phis)
+ && !gimple_has_side_effects (def))
+ {
+ int *usesp = ssa_remaining_uses.get (t);
+ int uses;
+
+ if (usesp)
+ uses = *usesp;
+ else
+ uses = uses_in_bb (t, bb);
+
+ /* STMT itself uses T within BB, so the count can't be zero.  */
+ gcc_assert (uses);
+
+ /* Don't bother recording the expected use count if we
+    won't find any further uses within BB (uses == 1 reaches
+    zero right below; uses == -1 can never reach zero).  */
+ if (!usesp && (uses < -1 || uses > 1))
+ {
+ usesp = &ssa_remaining_uses.get_or_insert (t);
+ *usesp = uses;
+ }
+
+ /* Negative counts mean T is also used outside BB, so its def
+    stays live no matter what; see uses_in_bb.  */
+ if (uses < 0)
+ continue;
+
+ --uses;
+ if (usesp)
+ *usesp = uses;
+
+ /* Last in-BB use gone: DEF dies too; queue it so its own
+    operands get credited (PHIs have no queueable operands
+    here).  */
+ if (!uses)
+ {
+ killed_stmts++;
+ if (usesp)
+ ssa_remaining_uses.remove (t);
+ if (gimple_code (def) != GIMPLE_PHI)
+ dead_worklist.safe_push (def);
+ }
+ }
+ }
+ }
+
+ if (dump_file)
+ fprintf (dump_file, "threading bb %i kills %i stmts\n",
+ bb->index, killed_stmts);
+
+ return killed_stmts;
+}
extern void delete_jump_thread_path (vec <class jump_thread_edge *> *);
extern void remove_ctrl_stmt_and_useless_edges (basic_block, basic_block);
extern void free_dom_edge_info (edge);
+extern unsigned int estimate_threading_killed_stmts (basic_block);
enum bb_dom_status
{