--- /dev/null
+// { dg-do compile }
+// { dg-additional-options "-fvect-cost-model=cheap" }
+// { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } }
+
+#include <stdint.h>
+#include <stdlib.h>
+
+// Assemble a 32-bit word from four bytes in big-endian order
+// (i0 becomes the most significant byte).
+inline uint32_t make_uint32(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3)
+{
+ return ((static_cast<uint32_t>(i0) << 24) |
+ (static_cast<uint32_t>(i1) << 16) |
+ (static_cast<uint32_t>(i2) << 8) |
+ (static_cast<uint32_t>(i3)));
+}
+
+// Load the OFF'th big-endian 32-bit word from the byte array IN.
+inline uint32_t load_be(const uint8_t in[], size_t off)
+{
+ in += off * sizeof(uint32_t);
+ return make_uint32(in[0], in[1], in[2], in[3]);
+}
+
+// Load eight consecutive big-endian 32-bit words from IN into x0..x7.
+// T is expected to be uint32_t here; the scalar load_be overload above
+// does the per-word byte assembly.
+template<typename T>
+inline void load_be(const uint8_t in[],
+ T& x0, T& x1, T& x2, T& x3,
+ T& x4, T& x5, T& x6, T& x7)
+{
+ x0 = load_be(in, 0);
+ x1 = load_be(in, 1);
+ x2 = load_be(in, 2);
+ x3 = load_be(in, 3);
+ x4 = load_be(in, 4);
+ x5 = load_be(in, 5);
+ x6 = load_be(in, 6);
+ x7 = load_be(in, 7);
+}
+
+// Store IN to OUT in big-endian byte order (byte-swap then memcpy,
+// so the store itself is unaligned-safe).
+inline void store_be(uint32_t in, uint8_t out[4])
+{
+ uint32_t o = __builtin_bswap32 (in);
+ __builtin_memcpy (out, &o, sizeof (uint32_t));
+}
+
+// Store eight 32-bit words to OUT in big-endian order at consecutive
+// sizeof(T)-byte offsets.
+template<typename T>
+inline void store_be(uint8_t out[], T x0, T x1, T x2, T x3,
+ T x4, T x5, T x6, T x7)
+{
+ store_be(x0, out + (0 * sizeof(T)));
+ store_be(x1, out + (1 * sizeof(T)));
+ store_be(x2, out + (2 * sizeof(T)));
+ store_be(x3, out + (3 * sizeof(T)));
+ store_be(x4, out + (4 * sizeof(T)));
+ store_be(x5, out + (5 * sizeof(T)));
+ store_be(x6, out + (6 * sizeof(T)));
+ store_be(x7, out + (7 * sizeof(T)));
+}
+
+#define BLOCK_SIZE 8
+// Encrypt BLOCKS 8-byte blocks, four at a time: 32 Feistel-style rounds
+// (shift/xor/add with round keys EK) over four independent L/R lanes.
+// The four parallel lanes form the SLP opportunity whose cost the
+// dg-final scan below expects to be rejected as unprofitable.
+// NOTE(review): only full groups of 4 blocks are processed; any tail
+// (blocks % 4) is deliberately ignored — test-only code.
+void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, uint32_t *EK)
+{
+ const size_t blocks4 = blocks / 4;
+
+ for (size_t i = 0; i < blocks4; i++)
+ {
+ uint32_t L0, R0, L1, R1, L2, R2, L3, R3;
+ load_be(in + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
+
+ for(size_t r = 0; r != 32; ++r)
+ {
+ L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*r];
+ L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*r];
+ L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*r];
+ L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*r];
+
+ R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*r+1];
+ R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*r+1];
+ R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*r+1];
+ R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*r+1];
+ }
+
+ store_be(out + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
+ }
+}
+
+// { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 2 "slp1" { target x86_64-*-* i?86-*-* } } }
}
}
+/* Comparator for the loop-index sorted cost vectors.  */
+
+static int
+li_cost_vec_cmp (const void *a_, const void *b_)
+{
+  auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
+  auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
+  /* Order solely by loop number, returning 0 for equal keys.
+     NOTE(review): entries with equal loop number must keep their relative
+     order (invariant/external/extract costs are expected to immediately
+     follow the cost of their related stmt) — this relies on the sort
+     being stable; verify the qsort implementation used guarantees that.  */
+  if (a->first < b->first)
+    return -1;
+  else if (a->first == b->first)
+    return 0;
+  return 1;
+}
+
/* Check if vectorization of the basic block is profitable for the
   subgraph denoted by SLP_INSTANCES. */
unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
- void *vect_target_cost_data = init_cost (NULL);
-
/* Calculate scalar cost and sum the cost for the vector stmts
previously collected. */
- stmt_vector_for_cost scalar_costs;
- scalar_costs.create (0);
+ stmt_vector_for_cost scalar_costs = vNULL;
+ stmt_vector_for_cost vector_costs = vNULL;
hash_set<slp_tree> visited;
FOR_EACH_VEC_ELT (slp_instances, i, instance)
{
auto_vec<bool, 20> life;
life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
true);
+ /* The instance root stmt is replaced by vector code as well, so account
+ its scalar cost.  */
+ if (SLP_INSTANCE_ROOT_STMT (instance))
+ record_stmt_cost (&scalar_costs, 1, scalar_stmt,
+ SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body);
vect_bb_slp_scalar_cost (bb_vinfo,
SLP_INSTANCE_TREE (instance),
&life, &scalar_costs, visited);
+ vector_costs.safe_splice (instance->cost_vec);
instance->cost_vec.release ();
}
/* Unset visited flag. */
- stmt_info_for_cost *si;
- FOR_EACH_VEC_ELT (scalar_costs, i, si)
- gimple_set_visited (si->stmt_info->stmt, false);
+ stmt_info_for_cost *cost;
+ FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+ gimple_set_visited (cost->stmt_info->stmt, false);
- void *scalar_target_cost_data = init_cost (NULL);
- add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
- scalar_costs.release ();
- unsigned dummy;
- finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
- destroy_cost_data (scalar_target_cost_data);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
+
+ /* When costing non-loop vectorization we need to consider each covered
+ loop independently and make sure vectorization is profitable. For
+ now we assume a loop may be not entered or executed an arbitrary
+ number of iterations (??? static information can provide more
+ precise info here) which means we can simply cost each containing
+ loops stmts separately. */
+
+ /* First produce cost vectors sorted by loop index. */
+ auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
+ li_scalar_costs (scalar_costs.length ());
+ auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
+ li_vector_costs (vector_costs.length ());
+ FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+ {
+ unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
+ li_scalar_costs.quick_push (std::make_pair (l, cost));
+ }
+ /* Use a random used loop as fallback in case the first vector_costs
+ entry does not have a stmt_info associated with it. */
+ /* NOTE(review): this indexes li_scalar_costs[0] unconditionally —
+ verify every SLP subgraph contributes at least one scalar cost
+ entry, otherwise this is an out-of-bounds access.  */
+ unsigned l = li_scalar_costs[0].first;
+ FOR_EACH_VEC_ELT (vector_costs, i, cost)
+ {
+ /* We inherit from the previous COST, invariants, externals and
+ extracts immediately follow the cost for the related stmt. */
+ if (cost->stmt_info)
+ l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
+ li_vector_costs.quick_push (std::make_pair (l, cost));
+ }
+ li_scalar_costs.qsort (li_cost_vec_cmp);
+ li_vector_costs.qsort (li_cost_vec_cmp);
+
+ /* Now cost the portions individually. */
+ unsigned vi = 0;
+ unsigned si = 0;
+ /* Walk both sorted vectors in lockstep: for each loop number present,
+ sum the scalar and vector costs of that loop's part and compare them
+ independently of the other parts.  */
+ do
+ {
+ unsigned sl = li_scalar_costs[si].first;
+ unsigned vl = li_vector_costs[vi].first;
+ if (sl != vl)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Scalar %d and vector %d loop part do not "
+ "match up, skipping scalar part\n", sl, vl);
+ /* Skip the scalar part, assuming zero cost on the vector side. */
+ do
+ {
+ si++;
+ }
+ while (si < li_scalar_costs.length ()
+ && li_scalar_costs[si].first == sl);
+ continue;
+ }
- /* Complete the target-specific vector cost calculation. */
- finish_cost (vect_target_cost_data, &vec_prologue_cost,
- &vec_inside_cost, &vec_epilogue_cost);
- destroy_cost_data (vect_target_cost_data);
+ /* Sum the target-specific scalar cost of this loop's part.  */
+ void *scalar_target_cost_data = init_cost (NULL);
+ do
+ {
+ add_stmt_cost (bb_vinfo, scalar_target_cost_data,
+ li_scalar_costs[si].second);
+ si++;
+ }
+ while (si < li_scalar_costs.length ()
+ && li_scalar_costs[si].first == sl);
+ unsigned dummy;
+ finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
+ destroy_cost_data (scalar_target_cost_data);
+
+ /* Complete the target-specific vector cost calculation. */
+ void *vect_target_cost_data = init_cost (NULL);
+ do
+ {
+ add_stmt_cost (bb_vinfo, vect_target_cost_data,
+ li_vector_costs[vi].second);
+ vi++;
+ }
+ while (vi < li_vector_costs.length ()
+ && li_vector_costs[vi].first == vl);
+ finish_cost (vect_target_cost_data, &vec_prologue_cost,
+ &vec_inside_cost, &vec_epilogue_cost);
+ destroy_cost_data (vect_target_cost_data);
- vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
+ vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
- if (dump_enabled_p ())
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Cost model analysis for part in loop %d:\n", sl);
+ dump_printf (MSG_NOTE, " Vector cost: %d\n",
+ vec_inside_cost + vec_outside_cost);
+ dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
+ }
+
+ /* Vectorization is profitable if its cost is more than the cost of scalar
+ version. Note that we err on the vector side for equal cost because
+ the cost estimate is otherwise quite pessimistic (constant uses are
+ free on the scalar side but cost a load on the vector side for
+ example). */
+ if (vec_outside_cost + vec_inside_cost > scalar_cost)
+ {
+ scalar_costs.release ();
+ vector_costs.release ();
+ return false;
+ }
+ }
+ while (si < li_scalar_costs.length ()
+ && vi < li_vector_costs.length ());
+ /* Vector parts with no matching scalar part mean unaccounted vector
+ cost — treat that as not profitable.  */
+ if (vi < li_vector_costs.length ())
{
- dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
- dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n",
- vec_inside_cost);
- dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost);
- dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost);
- dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Excess vector cost for part in loop %d:\n",
+ li_vector_costs[vi].first);
+ scalar_costs.release ();
+ vector_costs.release ();
+ return false;
}
- /* Vectorization is profitable if its cost is more than the cost of scalar
- version. Note that we err on the vector side for equal cost because
- the cost estimate is otherwise quite pessimistic (constant uses are
- free on the scalar side but cost a load on the vector side for
- example). */
- if (vec_outside_cost + vec_inside_cost > scalar_cost)
- return false;
-
+ scalar_costs.release ();
+ vector_costs.release ();
return true;
}