From: Richard Biener Date: Fri, 5 Feb 2021 08:54:00 +0000 (+0100) Subject: tree-optimization/98855 - redo BB vectorization costing X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=63538886d1f7fc7cbf066b4c2d6d7fd4da537259;p=gcc.git tree-optimization/98855 - redo BB vectorization costing The following attempts to account for the fact that BB vectorization regions now can span multiple loop levels and that an unprofitable inner loop vectorization shouldn't be offset by a profitable outer loop vectorization to make it overall profitable. For now I've implemented a heuristic based on the premise that vectorization should be profitable even if loops may not be entered or if they iterate any number of times. Especially the first assumption then requires that stmts directly belonging to loop A need to be costed separately from stmts belonging to another loop which also simplifies the implementation. On x86 the added testcase has in the outer loop t.c:38:20: note: Cost model analysis for part in loop 1: Vector cost: 56 Scalar cost: 192 and the inner loop t.c:38:20: note: Cost model analysis for part in loop 2: Vector cost: 132 Scalar cost: 48 and thus the vectorization is considered not profitable (note the same would happen in case the 2nd cost were for a loop outer to the 1st costing). Future enhancements may consider static knowledge of whether a loop is always entered which would allow some inefficiency in the vectorization of its loop header. Likewise stmts only reachable from a loop exit can be treated this way. 2021-02-05 Richard Biener PR tree-optimization/98855 * tree-vectorizer.h (add_stmt_cost): New overload. * tree-vect-slp.c (li_cost_vec_cmp): New. (vect_bb_slp_scalar_cost): Cost individual loop regions separately. Account for the scalar instance root stmt. * g++.dg/vect/slp-pr98855.cc: New testcase. 
--- diff --git a/gcc/testsuite/g++.dg/vect/slp-pr98855.cc b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc new file mode 100644 index 00000000000..0b4e479b513 --- /dev/null +++ b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc @@ -0,0 +1,84 @@ +// { dg-do compile } +// { dg-additional-options "-fvect-cost-model=cheap" } +// { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } + +#include +#include + +inline uint32_t make_uint32(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3) +{ + return ((static_cast(i0) << 24) | + (static_cast(i1) << 16) | + (static_cast(i2) << 8) | + (static_cast(i3))); +} + +inline uint32_t load_be(const uint8_t in[], size_t off) +{ + in += off * sizeof(uint32_t); + return make_uint32(in[0], in[1], in[2], in[3]); +} + +template +inline void load_be(const uint8_t in[], + T& x0, T& x1, T& x2, T& x3, + T& x4, T& x5, T& x6, T& x7) +{ + x0 = load_be(in, 0); + x1 = load_be(in, 1); + x2 = load_be(in, 2); + x3 = load_be(in, 3); + x4 = load_be(in, 4); + x5 = load_be(in, 5); + x6 = load_be(in, 6); + x7 = load_be(in, 7); +} + +inline void store_be(uint32_t in, uint8_t out[4]) +{ + uint32_t o = __builtin_bswap32 (in); + __builtin_memcpy (out, &o, sizeof (uint32_t)); +} + +template +inline void store_be(uint8_t out[], T x0, T x1, T x2, T x3, + T x4, T x5, T x6, T x7) +{ + store_be(x0, out + (0 * sizeof(T))); + store_be(x1, out + (1 * sizeof(T))); + store_be(x2, out + (2 * sizeof(T))); + store_be(x3, out + (3 * sizeof(T))); + store_be(x4, out + (4 * sizeof(T))); + store_be(x5, out + (5 * sizeof(T))); + store_be(x6, out + (6 * sizeof(T))); + store_be(x7, out + (7 * sizeof(T))); +} + +#define BLOCK_SIZE 8 +void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, uint32_t *EK) +{ + const size_t blocks4 = blocks / 4; + + for (size_t i = 0; i < blocks4; i++) + { + uint32_t L0, R0, L1, R1, L2, R2, L3, R3; + load_be(in + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3); + + for(size_t r = 0; r != 32; ++r) + { + L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*r]; 
+ L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*r]; + L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*r]; + L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*r]; + + R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*r+1]; + R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*r+1]; + R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*r+1]; + R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*r+1]; + } + + store_be(out + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3); + } +} + +// { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 2 "slp1" { target x86_64-*-* i?86-*-* } } } diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 2305bbdec3a..b9f12c30fb8 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -4340,6 +4340,20 @@ vect_bb_slp_scalar_cost (vec_info *vinfo, } } +/* Comparator for the loop-index sorted cost vectors. */ + +static int +li_cost_vec_cmp (const void *a_, const void *b_) +{ + auto *a = (const std::pair *)a_; + auto *b = (const std::pair *)b_; + if (a->first < b->first) + return -1; + else if (a->first == b->first) + return 0; + return 1; +} + /* Check if vectorization of the basic block is profitable for the subgraph denoted by SLP_INSTANCES. */ @@ -4352,61 +4366,152 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0; unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0; - void *vect_target_cost_data = init_cost (NULL); - /* Calculate scalar cost and sum the cost for the vector stmts previously collected. 
*/ - stmt_vector_for_cost scalar_costs; - scalar_costs.create (0); + stmt_vector_for_cost scalar_costs = vNULL; + stmt_vector_for_cost vector_costs = vNULL; hash_set visited; FOR_EACH_VEC_ELT (slp_instances, i, instance) { auto_vec life; life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)), true); + if (SLP_INSTANCE_ROOT_STMT (instance)) + record_stmt_cost (&scalar_costs, 1, scalar_stmt, + SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body); vect_bb_slp_scalar_cost (bb_vinfo, SLP_INSTANCE_TREE (instance), &life, &scalar_costs, visited); - add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec); + vector_costs.safe_splice (instance->cost_vec); instance->cost_vec.release (); } /* Unset visited flag. */ - stmt_info_for_cost *si; - FOR_EACH_VEC_ELT (scalar_costs, i, si) - gimple_set_visited (si->stmt_info->stmt, false); + stmt_info_for_cost *cost; + FOR_EACH_VEC_ELT (scalar_costs, i, cost) + gimple_set_visited (cost->stmt_info->stmt, false); - void *scalar_target_cost_data = init_cost (NULL); - add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs); - scalar_costs.release (); - unsigned dummy; - finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy); - destroy_cost_data (scalar_target_cost_data); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); + + /* When costing non-loop vectorization we need to consider each covered + loop independently and make sure vectorization is profitable. For + now we assume a loop may be not entered or executed an arbitrary + number of iterations (??? static information can provide more + precise info here) which means we can simply cost each containing + loops stmts separately. */ + + /* First produce cost vectors sorted by loop index. 
*/ + auto_vec > + li_scalar_costs (scalar_costs.length ()); + auto_vec > + li_vector_costs (vector_costs.length ()); + FOR_EACH_VEC_ELT (scalar_costs, i, cost) + { + unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num; + li_scalar_costs.quick_push (std::make_pair (l, cost)); + } + /* Use a random used loop as fallback in case the first vector_costs + entry does not have a stmt_info associated with it. */ + unsigned l = li_scalar_costs[0].first; + FOR_EACH_VEC_ELT (vector_costs, i, cost) + { + /* We inherit from the previous COST, invariants, externals and + extracts immediately follow the cost for the related stmt. */ + if (cost->stmt_info) + l = gimple_bb (cost->stmt_info->stmt)->loop_father->num; + li_vector_costs.quick_push (std::make_pair (l, cost)); + } + li_scalar_costs.qsort (li_cost_vec_cmp); + li_vector_costs.qsort (li_cost_vec_cmp); + + /* Now cost the portions individually. */ + unsigned vi = 0; + unsigned si = 0; + do + { + unsigned sl = li_scalar_costs[si].first; + unsigned vl = li_vector_costs[vi].first; + if (sl != vl) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Scalar %d and vector %d loop part do not " + "match up, skipping scalar part\n", sl, vl); + /* Skip the scalar part, assuming zero cost on the vector side. */ + do + { + si++; + } + while (si < li_scalar_costs.length () + && li_scalar_costs[si].first == sl); + continue; + } - /* Complete the target-specific vector cost calculation. 
*/ - finish_cost (vect_target_cost_data, &vec_prologue_cost, - &vec_inside_cost, &vec_epilogue_cost); - destroy_cost_data (vect_target_cost_data); + void *scalar_target_cost_data = init_cost (NULL); + do + { + add_stmt_cost (bb_vinfo, scalar_target_cost_data, + li_scalar_costs[si].second); + si++; + } + while (si < li_scalar_costs.length () + && li_scalar_costs[si].first == sl); + unsigned dummy; + finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy); + destroy_cost_data (scalar_target_cost_data); + + /* Complete the target-specific vector cost calculation. */ + void *vect_target_cost_data = init_cost (NULL); + do + { + add_stmt_cost (bb_vinfo, vect_target_cost_data, + li_vector_costs[vi].second); + vi++; + } + while (vi < li_vector_costs.length () + && li_vector_costs[vi].first == vl); + finish_cost (vect_target_cost_data, &vec_prologue_cost, + &vec_inside_cost, &vec_epilogue_cost); + destroy_cost_data (vect_target_cost_data); - vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; + vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; - if (dump_enabled_p ()) + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_NOTE, vect_location, + "Cost model analysis for part in loop %d:\n", sl); + dump_printf (MSG_NOTE, " Vector cost: %d\n", + vec_inside_cost + vec_outside_cost); + dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost); + } + + /* Vectorization is profitable if its cost is more than the cost of scalar + version. Note that we err on the vector side for equal cost because + the cost estimate is otherwise quite pessimistic (constant uses are + free on the scalar side but cost a load on the vector side for + example). 
*/ + if (vec_outside_cost + vec_inside_cost > scalar_cost) + { + scalar_costs.release (); + vector_costs.release (); + return false; + } + } + while (si < li_scalar_costs.length () + && vi < li_vector_costs.length ()); + if (vi < li_vector_costs.length ()) { - dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); - dump_printf (MSG_NOTE, " Vector inside of basic block cost: %d\n", - vec_inside_cost); - dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", vec_prologue_cost); - dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", vec_epilogue_cost); - dump_printf (MSG_NOTE, " Scalar cost of basic block: %d\n", scalar_cost); + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Excess vector cost for part in loop %d:\n", + li_vector_costs[vi].first); + scalar_costs.release (); + vector_costs.release (); + return false; } - /* Vectorization is profitable if its cost is more than the cost of scalar - version. Note that we err on the vector side for equal cost because - the cost estimate is otherwise quite pessimistic (constant uses are - free on the scalar side but cost a load on the vector side for - example). */ - if (vec_outside_cost + vec_inside_cost > scalar_cost) - return false; - + scalar_costs.release (); + vector_costs.release (); return true; } diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index e564fcf835a..b861c97ab3a 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1481,6 +1481,15 @@ add_stmt_cost (vec_info *vinfo, void *data, int count, return cost; } +/* Alias targetm.vectorize.add_stmt_cost. */ + +static inline unsigned +add_stmt_cost (vec_info *vinfo, void *data, stmt_info_for_cost *i) +{ + return add_stmt_cost (vinfo, data, i->count, i->kind, i->stmt_info, + i->vectype, i->misalign, i->where); +} + /* Alias targetm.vectorize.finish_cost. */ static inline void