#include "tm-constrs.h"
#include "params.h"
+static int x86_builtin_vectorization_cost (bool);
+
#ifndef CHECK_STACK_LIMIT
#define CHECK_STACK_LIMIT (-1)
#endif
{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 1, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 1, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
/* Processor costs (relative to an add) */
DUMMY_STRINGOP_ALGS},
{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
DUMMY_STRINGOP_ALGS},
{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{-1, rep_prefix_4_byte}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
DUMMY_STRINGOP_ALGS},
{{rep_prefix_4_byte, {{1024, unrolled_loop},
{8192, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
{{libcall, {{8, loop}, {24, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
- {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+ {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar_load_cost. */
+ 2, /* scalar_store_cost. */
+ 5, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 3, /* vec_unalign_load_cost. */
+ 3, /* vec_store_cost. */
+ 6, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
struct processor_costs amdfam10_cost = {
{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
{{libcall, {{8, loop}, {24, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
- {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+ {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar_load_cost. */
+ 2, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 2, /* vec_store_cost. */
+ 6, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
{-1, libcall}}},
DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
{-1, libcall}}},
{libcall, {{24, loop}, {64, unrolled_loop},
- {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{8, loop}, {15, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
{libcall, {{24, loop}, {32, unrolled_loop},
- {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
/* Generic64 should produce code tuned for Nocona and K8. */
{DUMMY_STRINGOP_ALGS,
{libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
{DUMMY_STRINGOP_ALGS,
- {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+ {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
/* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */
DUMMY_STRINGOP_ALGS},
{{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
const struct processor_costs *ix86_cost = &pentium_cost;
{ NULL, 0, 0, false, false, false, NULL }
};
+/* Implement targetm.vectorize.builtin_vectorization_cost. */
+static int
+x86_builtin_vectorization_cost (bool runtime_test)
+{
+  /* If the branch of the runtime test is taken - i.e. the vectorized
+     version is skipped - this incurs a misprediction cost (because the
+     vectorized version is expected to be the fall-through).  So we subtract
+     the latency of a mispredicted branch from the costs that are incurred
+     when the vectorized version is executed.
+
+     TODO: The values in the individual target cost tables have to be tuned,
+     or new fields may be needed.  For example, on K8 the default branch
+     path is the not-taken path.  If the taken path is predicted correctly,
+     the minimum penalty of going down the taken path is 1 cycle.  If the
+     taken path is not predicted correctly, the minimum penalty is 10
+     cycles.  */
+
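+  /* For example, with a cond_taken_branch_cost of 3 in the active cost
+     table, the runtime test contributes -3 to the outside-of-loop cost of
+     the vectorized version.  */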
+  if (runtime_test)
+    return -(ix86_cost->cond_taken_branch_cost);
+  else
+    return 0;
+}
+
/* Initialize the GCC target structure. */
#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value
+#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST x86_builtin_vectorization_cost
+
struct gcc_target targetm = TARGET_INITIALIZER;
\f
#include "gt-i386.h"
basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
int nbbs = loop->num_nodes;
int byte_misalign;
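+  /* Cost of the branches guarding the peeled prologue/epilogue loops.  */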
+ int peel_guard_costs = 0;
int innerloop_iters = 0, factor;
VEC (slp_instance, heap) *slp_instances;
slp_instance instance;
if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
{
- vec_outside_cost += TARG_COND_BRANCH_COST;
+ vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: Adding cost of checks for loop "
"versioning.\n");
loop.
FORNOW: If we don't know the value of peel_iters for prologue or epilogue
- at compile-time - we assume it's (vf-1)/2 (the worst would be vf-1).
+ at compile time, we assume it's vf/2 (the worst would be vf-1).
TODO: Build an expression that represents peel_iters for prologue and
epilogue to be used in a run-time test. */
if (byte_misalign < 0)
{
- peel_iters_prologue = (vf - 1)/2;
+ peel_iters_prologue = vf/2;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: "
- "prologue peel iters set to (vf-1)/2.");
+ "prologue peel iters set to vf/2.");
/* If peeling for alignment is unknown, loop bound of main loop becomes
unknown. */
- peel_iters_epilogue = (vf - 1)/2;
+ peel_iters_epilogue = vf/2;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: "
- "epilogue peel iters set to (vf-1)/2 because "
+ "epilogue peel iters set to vf/2 because "
"peeling for alignment is unknown .");
+
+      /* If the peeled iteration counts are unknown, count a taken and a
+         not-taken branch per peeled loop (one prologue and one epilogue
+         loop, hence the factor of 2).  Even if the scalar iteration count
+         is known, the vector iteration count is not, since the number of
+         peeled prologue iterations is unknown; hence the guards remain.  */
+ peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
+ + TARG_COND_NOT_TAKEN_BRANCH_COST);
+
}
else
{
if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
{
- peel_iters_epilogue = (vf - 1)/2;
+ peel_iters_epilogue = vf/2;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: "
- "epilogue peel iters set to (vf-1)/2 because "
+ "epilogue peel iters set to vf/2 because "
"loop iterations are unknown .");
+
+         /* If the peeled iteration counts are known but the number of
+            scalar loop iterations is unknown, count a taken branch per
+            peeled loop (again, one prologue and one epilogue loop).  */
+ peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
+
}
else
{
}
}
- /* Requires a prologue loop when peeling to handle misalignment. Add cost of
- two guards, one for the peeled loop and one for the vector loop. */
-
- if (peel_iters_prologue)
- {
- vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "cost model: Adding cost of checks for "
- "prologue.\n");
- }
-
- /* Requires an epilogue loop to finish up remaining iterations after vector
- loop. Add cost of two guards, one for the peeled loop and one for the
- vector loop. */
-
- if (peel_iters_epilogue
- || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- || LOOP_VINFO_INT_NITERS (loop_vinfo) % vf)
- {
- vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "cost model : Adding cost of checks for "
- "epilogue.\n");
- }
-
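+  /* Add the scalar work done in the peeled prologue/epilogue iterations and
+     the guard branches around those peel loops to the outside-of-loop cost
+     of the vectorized version.  */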
vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
- + (peel_iters_epilogue * scalar_single_iter_cost);
+ + (peel_iters_epilogue * scalar_single_iter_cost)
+ + peel_guard_costs;
/* Allow targets to add additional (outside-of-loop) costs. FORNOW, the only
information we provide for the target is whether testing against the
if ((scalar_single_iter_cost * vf) > vec_inside_cost)
{
- if (vec_outside_cost == 0)
+ if (vec_outside_cost <= 0)
min_profitable_iters = 1;
else
{
- min_profitable_iters = (vec_outside_cost * vf)
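+          /* The vector version becomes profitable once
+               niters * scalar_single_iter_cost
+                 > vec_outside_cost
+                   + vec_inside_cost * (niters - peel_iters_prologue
+                                        - peel_iters_epilogue) / vf;
+             solving this inequality for niters gives the expression
+             below.  */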
+ min_profitable_iters = (vec_outside_cost * vf
+ - vec_inside_cost * peel_iters_prologue
+ - vec_inside_cost * peel_iters_epilogue)
/ ((scalar_single_iter_cost * vf)
- vec_inside_cost);
peel_iters_epilogue);
fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
min_profitable_iters);
- fprintf (vect_dump, " Actual minimum iters for profitability: %d\n",
- min_profitable_iters < vf ? vf : min_profitable_iters);
}
min_profitable_iters =
if (niters <= min_profitable_iters)
then skip the vectorized loop. */
min_profitable_iters--;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, " Profitability threshold = %d\n",
+ min_profitable_iters);
+
return min_profitable_iters;
}
/* Analyze cost to set threshold for vectorized loop. */
min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
- min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
- * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
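+  /* The -1 presumably keeps this bound in the same "niters <= threshold
+     ==> skip the vectorized loop" form as min_profitable_iters above, so
+     the two thresholds can be compared directly.  */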
+ min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+ * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
/* Use the cost model only if it is more conservative than user specified
threshold. */
|| min_profitable_iters > min_scalar_loop_bound))
th = (unsigned) min_profitable_iters;
- if (min_profitable_iters
- && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ if (((LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+ || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
&& vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "vectorization may not be profitable.");