i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather cost correctly.

author Jan Hubicka <hubicka@ucw.cz>

Wed, 25 Oct 2017 19:11:41 +0000 (21:11 +0200)

committer Jan Hubicka <hubicka@gcc.gnu.org>

Wed, 25 Oct 2017 19:11:41 +0000 (19:11 +0000)
author Jan Hubicka <hubicka@ucw.cz>
Wed, 25 Oct 2017 19:11:41 +0000 (21:11 +0200)
committer Jan Hubicka <hubicka@gcc.gnu.org>
Wed, 25 Oct 2017 19:11:41 +0000 (19:11 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index c5dfcb704eb7a25e824e8e37c2c5476eba91aeb2..5985d9ed8c9bd692c82589af349a6e1c3528c393 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2017-10-23  Jan Hubicka  <hubicka@ucw.cz>
+
+       * i386.c (ix86_builtin_vectorization_cost): Compute scatter/gather
+       cost correctly.
+       * i386.h (processor_costs): Add gather_static, gather_per_elt,
+       scatter_static, scatter_per_elt.
+       * x86-tune-costs.h: Add new cost entries.
+
  2017-10-25  Richard Biener  <rguenther@suse.de>
  
         * tree-ssa-sccvn.h (vn_eliminate): Declare.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c

index 367cadea3c1303409a46cea1af08aa7b11f2f9ef..56486e049c7146c910bd9472c03ef5066b428973 100644 (file)
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -44490,7 +44490,6 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
        /* We should have separate costs for unaligned loads and gather/scatter.
          Do that incrementally.  */
        case unaligned_load:
-      case vector_gather_load:
         index = sse_store_index (mode);
          return ix86_vec_cost (mode,
                               COSTS_N_INSNS
@@ -44498,13 +44497,28 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
                               true);
  
        case unaligned_store:
-      case vector_scatter_store:
         index = sse_store_index (mode);
          return ix86_vec_cost (mode,
                               COSTS_N_INSNS
                                  (ix86_cost->sse_unaligned_store[index]) / 2,
                               true);
  
+      case vector_gather_load:
+        return ix86_vec_cost (mode,
+                             COSTS_N_INSNS
+                                (ix86_cost->gather_static
+                                 + ix86_cost->gather_per_elt
+                                   * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+                             true);
+
+      case vector_scatter_store:
+        return ix86_vec_cost (mode,
+                             COSTS_N_INSNS
+                                (ix86_cost->scatter_static
+                                 + ix86_cost->scatter_per_elt
+                                   * TYPE_VECTOR_SUBPARTS (vectype)) / 2,
+                             true);
+
        case cond_branch_taken:
          return ix86_cost->cond_taken_branch_cost;
  
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h

index 27fc9f08cc73ef239b6e62c447e2f1c9204546c2..837906b51694da75e682110a6342d21f1620482e 100644 (file)
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -253,6 +253,10 @@ struct processor_costs {
    const int mmxsse_to_integer; /* cost of moving mmxsse register to
                                    integer.  */
    const int ssemmx_to_integer;  /* cost of moving integer to mmxsse register. */
+  const int gather_static, gather_per_elt; /* Cost of gather load is computed
+                                  as static + per_elt * nelts.  */
+  const int scatter_static, scatter_per_elt; /* Cost of scatter store is
+                                  computed as static + per_elt * nelts.  */
    const int l1_cache_size;     /* size of l1 cache, in kilobytes.  */
    const int l2_cache_size;     /* size of l2 cache, in kilobytes.  */
    const int prefetch_block;    /* bytes moved to cache for prefetch.  */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h

index e31d7cef2ebf488975003a8d8779dcf94ec9ba63..c7ac70e8453e0336370652f2683a37418e95d52e 100644 (file)
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -82,6 +82,8 @@ struct processor_costs ix86_size_cost = {/* costs for tuning for size */
    {3, 3, 3, 3, 3},                             /* cost of unaligned SSE store
                                            in 128bit, 256bit and 512bit */
    3, 3,                                        /* SSE->integer and integer->SSE moves */
+  5, 0,                                        /* Gather load static, per_elt.  */
+  5, 0,                                        /* Scatter store static, per_elt.  */
    0,                                   /* size of l1 cache  */
    0,                                   /* size of l2 cache  */
    0,                                   /* size of prefetch block */
@@ -166,6 +168,8 @@ struct processor_costs i386_cost = {        /* 386 specific costs */
                                            in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
    3, 3,                                        /* SSE->integer and integer->SSE moves */
+  4, 4,                                        /* Gather load static, per_elt.  */
+  4, 4,                                        /* Scatter store static, per_elt.  */
    0,                                   /* size of l1 cache  */
    0,                                   /* size of l2 cache  */
    0,                                   /* size of prefetch block */
@@ -249,6 +253,8 @@ struct processor_costs i486_cost = {        /* 486 specific costs */
                                            in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
    3, 3,                                        /* SSE->integer and integer->SSE moves */
+  4, 4,                                        /* Gather load static, per_elt.  */
+  4, 4,                                        /* Scatter store static, per_elt.  */
    4,                                   /* size of l1 cache.  486 has 8kB cache
                                            shared for code and data, so 4kB is
                                            not really precise.  */
@@ -334,6 +340,8 @@ struct processor_costs pentium_cost = {
                                            in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
    3, 3,                                        /* SSE->integer and integer->SSE moves */
+  4, 4,                                        /* Gather load static, per_elt.  */
+  4, 4,                                        /* Scatter store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
    8,                                   /* size of l2 cache  */
    0,                                   /* size of prefetch block */
@@ -410,6 +418,8 @@ struct processor_costs lakemont_cost = {
                                            in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
    3, 3,                                        /* SSE->integer and integer->SSE moves */
+  4, 4,                                        /* Gather load static, per_elt.  */
+  4, 4,                                        /* Scatter store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
    8,                                   /* size of l2 cache  */
    0,                                   /* size of prefetch block */
@@ -501,6 +511,8 @@ struct processor_costs pentiumpro_cost = {
                                            in 32,64,128,256 and 512-bit */
    {4, 8, 16, 32, 64},                  /* cost of unaligned stores.  */
    3, 3,                                        /* SSE->integer and integer->SSE moves */
+  4, 4,                                        /* Gather load static, per_elt.  */
+  4, 4,                                        /* Scatter store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
    256,                                 /* size of l2 cache  */
    32,                                  /* size of prefetch block */
@@ -584,6 +596,8 @@ struct processor_costs geode_cost = {
                                            in 32,64,128,256 and 512-bit */
    {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
    6, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 2,                                        /* Gather load static, per_elt.  */
+  2, 2,                                        /* Scatter store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
    128,                                 /* size of l2 cache.  */
    32,                                  /* size of prefetch block */
@@ -666,6 +680,8 @@ struct processor_costs k6_cost = {
                                            in 32,64,128,256 and 512-bit */
    {2, 2, 8, 16, 32},                   /* cost of unaligned stores.  */
    6, 6,                                        /* SSE->integer and integer->SSE moves */
+  2, 2,                                        /* Gather load static, per_elt.  */
+  2, 2,                                        /* Scatter store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
    32,                                  /* size of l2 cache.  Some models
                                            have integrated l2 cache, but
@@ -754,6 +770,8 @@ struct processor_costs athlon_cost = {
                                            in 32,64,128,256 and 512-bit */
    {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
    5, 5,                                        /* SSE->integer and integer->SSE moves */
+  4, 4,                                        /* Gather load static, per_elt.  */
+  4, 4,                                        /* Scatter store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
    256,                                 /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -844,6 +862,8 @@ struct processor_costs k8_cost = {
                                            in 32,64,128,256 and 512-bit */
    {4, 4, 5, 10, 20},                   /* cost of unaligned stores.  */
    5, 5,                                        /* SSE->integer and integer->SSE moves */
+  4, 4,                                        /* Gather load static, per_elt.  */
+  4, 4,                                        /* Scatter store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
    512,                                 /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -946,6 +966,8 @@ struct processor_costs amdfam10_cost = {
                                                                1/1  1/1
                                             MOVD reg32, xmmreg Double FADD 3
                                                                1/1  1/1 */
+  4, 4,                                        /* Gather load static, per_elt.  */
+  4, 4,                                        /* Scatter store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
    512,                                 /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -1041,6 +1063,8 @@ const struct processor_costs bdver1_cost = {
                                            in 32,64,128,256 and 512-bit */
    {10, 10, 10, 20, 30},                        /* cost of unaligned stores.  */
    16, 20,                              /* SSE->integer and integer->SSE moves */
+  12, 12,                              /* Gather load static, per_elt.  */
+  10, 10,                              /* Scatter store static, per_elt.  */
    16,                                  /* size of l1 cache.  */
    2048,                                        /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -1138,6 +1162,8 @@ const struct processor_costs bdver2_cost = {
                                            in 32,64,128,256 and 512-bit */
    {10, 10, 10, 20, 30},                        /* cost of unaligned stores.  */
    16, 20,                              /* SSE->integer and integer->SSE moves */
+  12, 12,                              /* Gather load static, per_elt.  */
+  10, 10,                              /* Scatter store static, per_elt.  */
    16,                                  /* size of l1 cache.  */
    2048,                                        /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -1234,6 +1260,8 @@ struct processor_costs bdver3_cost = {
                                            in 32,64,128,256 and 512-bit */
    {10, 10, 10, 20, 30},                        /* cost of unaligned stores.  */
    16, 20,                              /* SSE->integer and integer->SSE moves */
+  12, 12,                              /* Gather load static, per_elt.  */
+  10, 10,                              /* Scatter store static, per_elt.  */
    16,                                  /* size of l1 cache.  */
    2048,                                        /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -1329,6 +1357,8 @@ struct processor_costs bdver4_cost = {
                                            in 32,64,128,256 and 512-bit */
    {10, 10, 10, 20, 30},                        /* cost of unaligned stores.  */
    16, 20,                              /* SSE->integer and integer->SSE moves */
+  12, 12,                              /* Gather load static, per_elt.  */
+  10, 10,                              /* Scatter store static, per_elt.  */
    16,                                  /* size of l1 cache.  */
    2048,                                        /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -1435,6 +1465,11 @@ struct processor_costs znver1_cost = {
                                            in 32,64,128,256 and 512-bit.  */
    {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
    6, 6,                                        /* SSE->integer and integer->SSE moves.  */
+  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
+     throughput 12.  Approx 9 uops do not depend on vector size and every load
+     is 7 uops.  */
+  18, 8,                               /* Gather load static, per_elt.  */
+  18, 10,                              /* Scatter store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
    512,                                 /* size of l2 cache.  */
    64,                                  /* size of prefetch block.  */
@@ -1539,6 +1574,8 @@ const struct processor_costs btver1_cost = {
                                            in 32,64,128,256 and 512-bit */
    {10, 10, 12, 24, 48},                        /* cost of unaligned stores.  */
    14, 14,                              /* SSE->integer and integer->SSE moves */
+  10, 10,                              /* Gather load static, per_elt.  */
+  10, 10,                              /* Scatter store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
    512,                                 /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -1624,6 +1661,8 @@ const struct processor_costs btver2_cost = {
                                            in 32,64,128,256 and 512-bit */
    {10, 10, 12, 24, 48},                        /* cost of unaligned stores.  */
    14, 14,                              /* SSE->integer and integer->SSE moves */
+  10, 10,                              /* Gather load static, per_elt.  */
+  10, 10,                              /* Scatter store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
    2048,                                        /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -1708,6 +1747,8 @@ struct processor_costs pentium4_cost = {
                                            in 32,64,128,256 and 512-bit */
    {32, 32, 32, 64, 128},               /* cost of unaligned stores.  */
    20, 12,                              /* SSE->integer and integer->SSE moves */
+  16, 16,                              /* Gather load static, per_elt.  */
+  16, 16,                              /* Scatter store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
    256,                                 /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -1795,6 +1836,8 @@ struct processor_costs nocona_cost = {
                                            in 32,64,128,256 and 512-bit */
    {24, 24, 24, 48, 96},                        /* cost of unaligned stores.  */
    20, 12,                              /* SSE->integer and integer->SSE moves */
+  12, 12,                              /* Gather load static, per_elt.  */
+  12, 12,                              /* Scatter store static, per_elt.  */
    8,                                   /* size of l1 cache.  */
    1024,                                        /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -1880,6 +1923,8 @@ struct processor_costs atom_cost = {
                                            in 32,64,128,256 and 512-bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
    8, 6,                                        /* SSE->integer and integer->SSE moves */
+  8, 8,                                        /* Gather load static, per_elt.  */
+  8, 8,                                        /* Scatter store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
    256,                                 /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -1965,6 +2010,8 @@ struct processor_costs slm_cost = {
                                            in 32,64,128,256 and 512-bit */
    {16, 16, 16, 32, 64},                        /* cost of unaligned stores.  */
    8, 6,                                        /* SSE->integer and integer->SSE moves */
+  8, 8,                                        /* Gather load static, per_elt.  */
+  8, 8,                                        /* Scatter store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
    256,                                 /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -2050,6 +2097,8 @@ struct processor_costs intel_cost = {
                                            in 32,64,128,256 and 512-bit */
    {10, 10, 10, 10, 10},                        /* cost of unaligned loads.  */
    4, 4,                                        /* SSE->integer and integer->SSE moves */
+  6, 6,                                        /* Gather load static, per_elt.  */
+  6, 6,                                        /* Scatter store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
    256,                                 /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -2142,6 +2191,8 @@ struct processor_costs generic_cost = {
                                            in 32,64,128,256 and 512-bit */
    {10, 10, 10, 15, 20},                        /* cost of unaligned storess.  */
    20, 20,                              /* SSE->integer and integer->SSE moves */
+  6, 6,                                        /* Gather load static, per_elt.  */
+  6, 6,                                        /* Scatter store static, per_elt.  */
    32,                                  /* size of l1 cache.  */
    512,                                 /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
@@ -2239,6 +2290,11 @@ struct processor_costs core_cost = {
                                            in 32,64,128,256 and 512-bit */
    {6, 6, 6, 6, 12},                    /* cost of unaligned stores.  */
    2, 2,                                        /* SSE->integer and integer->SSE moves */
+  /* VGATHERDPD is 7 uops, rec throughput 5, while VGATHERDPD is 9 uops,
+     rec. throughput 6.
+     So 5 uops statically and one uop per load.  */
+  10, 6,                               /* Gather load static, per_elt.  */
+  10, 6,                               /* Scatter store static, per_elt.  */
    64,                                  /* size of l1 cache.  */
    512,                                 /* size of l2 cache.  */
    64,                                  /* size of prefetch block */
author	Jan Hubicka <hubicka@ucw.cz>
	Wed, 25 Oct 2017 19:11:41 +0000 (21:11 +0200)
committer	Jan Hubicka <hubicka@gcc.gnu.org>
	Wed, 25 Oct 2017 19:11:41 +0000 (19:11 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/i386/i386.c		patch \| blob \| history
gcc/config/i386/i386.h		patch \| blob \| history
gcc/config/i386/x86-tune-costs.h		patch \| blob \| history