From: Richard Biener
Date: Fri, 16 Feb 2018 13:47:25 +0000 (+0000)
Subject: re PR tree-optimization/84037 (Speed regression of polyhedron benchmark since r256644)
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=be77ba2a461eefdf4a2676b19025f36ec092c598;p=gcc.git

re PR tree-optimization/84037 (Speed regression of polyhedron benchmark since r256644)

2018-02-16  Richard Biener

	PR tree-optimization/84037
	PR tree-optimization/84016
	PR target/82862
	* config/i386/i386.c (ix86_builtin_vectorization_cost):
	Adjust vec_construct for the fact we need additional higher latency
	128bit inserts for AVX256 and AVX512 vector builds.
	(ix86_add_stmt_cost): Scale vector construction cost for
	elementwise loads.

From-SVN: r257734
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c43637aac5e..01ca398965e 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,14 @@
+2018-02-16  Richard Biener
+
+	PR tree-optimization/84037
+	PR tree-optimization/84016
+	PR target/82862
+	* config/i386/i386.c (ix86_builtin_vectorization_cost):
+	Adjust vec_construct for the fact we need additional higher latency
+	128bit inserts for AVX256 and AVX512 vector builds.
+	(ix86_add_stmt_cost): Scale vector construction cost for
+	elementwise loads.
+
 2018-02-16  Richard Biener
 
 	PR tree-optimization/84417
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 2e82842c022..4a968a75a9e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -45906,7 +45906,18 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                               ix86_cost->sse_op, true);
 
       case vec_construct:
-        return ix86_vec_cost (mode, ix86_cost->sse_op, false);
+        {
+          /* N element inserts.  */
+          int cost = ix86_vec_cost (mode, ix86_cost->sse_op, false);
+          /* One vinserti128 for combining two SSE vectors for AVX256.  */
+          if (GET_MODE_BITSIZE (mode) == 256)
+            cost += ix86_vec_cost (mode, ix86_cost->addss, true);
+          /* One vinserti64x4 and two vinserti128 for combining SSE
+             and AVX256 vectors to AVX512.  */
+          else if (GET_MODE_BITSIZE (mode) == 512)
+            cost += 3 * ix86_vec_cost (mode, ix86_cost->addss, true);
+          return cost;
+        }
 
       default:
         gcc_unreachable ();
@@ -50245,6 +50256,18 @@ ix86_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
           break;
         }
     }
+  /* If we do elementwise loads into a vector then we are bound by
+     latency and execution resources for the many scalar loads
+     (AGU and load ports).  Try to account for this by scaling the
+     construction cost by the number of elements involved.  */
+  if (kind == vec_construct
+      && stmt_info
+      && stmt_info->type == load_vec_info_type
+      && stmt_info->memory_access_type == VMAT_ELEMENTWISE)
+    {
+      stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
+      stmt_cost *= TYPE_VECTOR_SUBPARTS (vectype);
+    }
   if (stmt_cost == -1)
     stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 