From a4fe6139ab2e39d8b264befaf38f748e5c88d76a Mon Sep 17 00:00:00 2001
From: Jan Hubicka
Date: Wed, 25 Oct 2017 21:11:41 +0200
Subject: [PATCH] i386.c (ix86_builtin_vectorization_cost): Compute
 scatter/gather cost correctly.

	* i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather
	cost correctly.
	* i386.h (processor_costs): Add gather_static, gather_per_elt,
	scatter_static, scatter_per_elt.
	* x86-tune-costs.h: Add new cost entries.

From-SVN: r254083
---
 gcc/ChangeLog                    |  8 +++++
 gcc/config/i386/i386.c           | 18 ++++++++--
 gcc/config/i386/i386.h           |  4 +++
 gcc/config/i386/x86-tune-costs.h | 56 ++++++++++++++++++++++++++++++++
 4 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c5dfcb704eb..5985d9ed8c9 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2017-10-23  Jan Hubicka
+
+	* i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather
+	cost correctly.
+	* i386.h (processor_costs): Add gather_static, gather_per_elt,
+	scatter_static, scatter_per_elt.
+	* x86-tune-costs.h: Add new cost entries.
+
 2017-10-25  Richard Biener
 
 	* tree-ssa-sccvn.h (vn_eliminate): Declare.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 367cadea3c1..56486e049c7 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -44490,7 +44490,6 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
       /* We should have separate costs for unaligned loads and gather/scatter.
 	 Do that incrementally.  */
       case unaligned_load:
-      case vector_gather_load:
 	index = sse_store_index (mode);
 	return ix86_vec_cost (mode,
 			      COSTS_N_INSNS
@@ -44498,13 +44497,28 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
 			      true);
 
       case unaligned_store:
-      case vector_scatter_store:
 	index = sse_store_index (mode);
 	return ix86_vec_cost (mode,
 			      COSTS_N_INSNS
 			      (ix86_cost->sse_unaligned_store[index]) / 2,
 			      true);
 
+      case vector_gather_load:
+	return ix86_vec_cost (mode,
+			      COSTS_N_INSNS
+				 (ix86_cost->gather_static
+				  + ix86_cost->gather_per_elt
+				    * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+			      true);
+
+      case vector_scatter_store:
+	return ix86_vec_cost (mode,
+			      COSTS_N_INSNS
+				 (ix86_cost->scatter_static
+				  + ix86_cost->scatter_per_elt
+				    * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+			      true);
+
       case cond_branch_taken:
 	return ix86_cost->cond_taken_branch_cost;
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 27fc9f08cc7..837906b5169 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -253,6 +253,10 @@ struct processor_costs {
   const int mmxsse_to_integer;	/* cost of moving mmxsse register to
 				   integer.  */
   const int ssemmx_to_integer;	/* cost of moving integer to mmxsse register.  */
+  const int gather_static, gather_per_elt; /* Cost of gather load is computed
+				   as static + per_item * nelts.  */
+  const int scatter_static, scatter_per_elt; /* Cost of gather store is
+				   computed as static + per_item * nelts.  */
   const int l1_cache_size;	/* size of l1 cache, in kilobytes.  */
   const int l2_cache_size;	/* size of l2 cache, in kilobytes.  */
   const int prefetch_block;	/* bytes moved to cache for prefetch.  */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index e31d7cef2eb..c7ac70e8453 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -82,6 +82,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
   {3, 3, 3, 3, 3},			/* cost of unaligned SSE store
 					   in 128bit, 256bit and 512bit */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  5, 0,					/* Gather load static, per_elt.  */
+  5, 0,					/* Gather store static, per_elt.  */
   0,					/* size of l1 cache  */
   0,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -166,6 +168,8 @@ struct processor_costs i386_cost = {	/* 386 specific costs */
 					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   0,					/* size of l1 cache  */
   0,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -249,6 +253,8 @@ struct processor_costs i486_cost = {	/* 486 specific costs */
 					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   4,					/* size of l1 cache.  486 has 8kB cache
 					   shared for code and data, so 4kB is
 					   not really precise.  */
@@ -334,6 +340,8 @@ struct processor_costs pentium_cost = {
 					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
   8,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -410,6 +418,8 @@ struct processor_costs lakemont_cost = {
 					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
   8,					/* size of l2 cache  */
   0,					/* size of prefetch block */
@@ -501,6 +511,8 @@ struct processor_costs pentiumpro_cost = {
 					   in 32,64,128,256 and 512-bit */
   {4, 8, 16, 32, 64},			/* cost of unaligned stores.  */
   3, 3,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
   256,					/* size of l2 cache  */
   32,					/* size of prefetch block */
@@ -584,6 +596,8 @@ struct processor_costs geode_cost = {
 					   in 32,64,128,256 and 512-bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
   6, 6,					/* SSE->integer and integer->SSE moves */
+  2, 2,					/* Gather load static, per_elt.  */
+  2, 2,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
   128,					/* size of l2 cache.  */
   32,					/* size of prefetch block */
@@ -666,6 +680,8 @@ struct processor_costs k6_cost = {
 					   in 32,64,128,256 and 512-bit */
   {2, 2, 8, 16, 32},			/* cost of unaligned stores.  */
   6, 6,					/* SSE->integer and integer->SSE moves */
+  2, 2,					/* Gather load static, per_elt.  */
+  2, 2,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   32,					/* size of l2 cache.  Some models
 					   have integrated l2 cache, but
@@ -754,6 +770,8 @@ struct processor_costs athlon_cost = {
 					   in 32,64,128,256 and 512-bit */
   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
   5, 5,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -844,6 +862,8 @@ struct processor_costs k8_cost = {
 					   in 32,64,128,256 and 512-bit */
   {4, 4, 5, 10, 20},			/* cost of unaligned stores.  */
   5, 5,					/* SSE->integer and integer->SSE moves */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -946,6 +966,8 @@ struct processor_costs amdfam10_cost = {
 					      1/1  1/1
 					    MOVD reg32, xmmreg Double	FADD 3
 						 1/1  1/1 */
+  4, 4,					/* Gather load static, per_elt.  */
+  4, 4,					/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1041,6 +1063,8 @@ const struct processor_costs bdver1_cost = {
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
+  12, 12,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1138,6 +1162,8 @@ const struct processor_costs bdver2_cost = {
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
+  12, 12,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1234,6 +1260,8 @@ struct processor_costs bdver3_cost = {
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
+  12, 12,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1329,6 +1357,8 @@ struct processor_costs bdver4_cost = {
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 20, 30},			/* cost of unaligned stores.  */
   16, 20,				/* SSE->integer and integer->SSE moves */
+  12, 12,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   16,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1435,6 +1465,11 @@ struct processor_costs znver1_cost = {
 					   in 32,64,128,256 and 512-bit.  */
   {8, 8, 8, 8, 16},			/* cost of unaligned stores.  */
   6, 6,					/* SSE->integer and integer->SSE moves.  */
+  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
+     throughput 12.  Approx 9 uops do not depend on vector size and every load
+     is 7 uops.  */
+  18, 8,				/* Gather load static, per_elt.  */
+  18, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block.  */
@@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost = {
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
   14, 14,				/* SSE->integer and integer->SSE moves */
+  10, 10,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost = {
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 12, 24, 48},			/* cost of unaligned stores.  */
   14, 14,				/* SSE->integer and integer->SSE moves */
+  10, 10,				/* Gather load static, per_elt.  */
+  10, 10,				/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   2048,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = {
 					   in 32,64,128,256 and 512-bit */
   {32, 32, 32, 64, 128},		/* cost of unaligned stores.  */
   20, 12,				/* SSE->integer and integer->SSE moves */
+  16, 16,				/* Gather load static, per_elt.  */
+  16, 16,				/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = {
 					   in 32,64,128,256 and 512-bit */
   {24, 24, 24, 48, 96},			/* cost of unaligned stores.  */
   20, 12,				/* SSE->integer and integer->SSE moves */
+  12, 12,				/* Gather load static, per_elt.  */
+  12, 12,				/* Gather store static, per_elt.  */
   8,					/* size of l1 cache.  */
   1024,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = {
 					   in 32,64,128,256 and 512-bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
   8, 6,					/* SSE->integer and integer->SSE moves */
+  8, 8,					/* Gather load static, per_elt.  */
+  8, 8,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = {
 					   in 32,64,128,256 and 512-bit */
   {16, 16, 16, 32, 64},			/* cost of unaligned stores.  */
   8, 6,					/* SSE->integer and integer->SSE moves */
+  8, 8,					/* Gather load static, per_elt.  */
+  8, 8,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = {
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
   4, 4,					/* SSE->integer and integer->SSE moves */
+  6, 6,					/* Gather load static, per_elt.  */
+  6, 6,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   256,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = {
 					   in 32,64,128,256 and 512-bit */
   {10, 10, 10, 15, 20},			/* cost of unaligned storess.  */
   20, 20,				/* SSE->integer and integer->SSE moves */
+  6, 6,					/* Gather load static, per_elt.  */
+  6, 6,					/* Gather store static, per_elt.  */
   32,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
@@ -2239,6 +2290,11 @@ struct processor_costs core_cost = {
 					   in 32,64,128,256 and 512-bit */
   {6, 6, 6, 6, 12},			/* cost of unaligned stores.  */
   2, 2,					/* SSE->integer and integer->SSE moves */
+  /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
+     rec. throughput 6.
+     So 5 uops statically and one uops per load.  */
+  10, 6,				/* Gather load static, per_elt.  */
+  10, 6,				/* Gather store static, per_elt.  */
   64,					/* size of l1 cache.  */
   512,					/* size of l2 cache.  */
   64,					/* size of prefetch block */
-- 
2.30.2
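Editor's note: the cost model the patch introduces is simply
cost = COSTS_N_INSNS (static + per_elt * nelts) / 2, halved to stay on the
same scale as the existing unaligned load/store cases above.  Below is a
minimal standalone C sketch of that arithmetic, using the znver1 and core
table values added by the patch.  It assumes COSTS_N_INSNS (N) expands to
N * 4 as in GCC's rtl.h, and it deliberately ignores the extra scaling that
ix86_vec_cost may apply when a vector mode has to be split; the struct and
function names are illustrative only, not GCC's.

#include <stdio.h>

/* Simplified stand-in for GCC's COSTS_N_INSNS (rtl.h): one "instruction"
   is 4 cost units.  */
#define COSTS_N_INSNS(N) ((N) * 4)

/* Hypothetical mirror of the new processor_costs fields.  */
struct gather_scatter_costs
{
  const char *cpu;
  int gather_static, gather_per_elt;
  int scatter_static, scatter_per_elt;
};

/* Gather load cost as ix86_builtin_vectorization_cost now computes it:
   (static + per_elt * nelts) instructions, halved like the unaligned
   load/store cases.  */
static int
gather_load_cost (const struct gather_scatter_costs *c, int nelts)
{
  return COSTS_N_INSNS (c->gather_static + c->gather_per_elt * nelts) / 2;
}

static int
scatter_store_cost (const struct gather_scatter_costs *c, int nelts)
{
  return COSTS_N_INSNS (c->scatter_static + c->scatter_per_elt * nelts) / 2;
}

int
main (void)
{
  /* Values taken from the znver1_cost and core_cost entries in the patch.  */
  struct gather_scatter_costs tables[] = {
    { "znver1", 18, 8, 18, 10 },
    { "core",   10, 6, 10, 6 },
  };

  for (unsigned i = 0; i < sizeof tables / sizeof tables[0]; i++)
    /* E.g. a 4-element gather (V4DF) on znver1:
       (18 + 8 * 4) * 4 / 2 = 100.  */
    printf ("%s: 4-elt gather %d, 8-elt gather %d, 4-elt scatter %d\n",
	    tables[i].cpu,
	    gather_load_cost (&tables[i], 4),
	    gather_load_cost (&tables[i], 8),
	    scatter_store_cost (&tables[i], 4));
  return 0;
}

The per-element term is what the old code was missing: before the patch a
gather was costed like a plain unaligned load, independent of how many
elements it touches, so wide gathers on targets where they are expensive
(e.g. znver1) looked far cheaper to the vectorizer than they really are.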