#include "tm-constrs.h"
#include "params.h"
+static int x86_builtin_vectorization_cost (bool);
+
#ifndef CHECK_STACK_LIMIT
#define CHECK_STACK_LIMIT (-1)
#endif
{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
- {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}
+ {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 1, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 1, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
/* Processor costs (relative to an add) */
DUMMY_STRINGOP_ALGS},
{{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
DUMMY_STRINGOP_ALGS},
{{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{-1, rep_prefix_4_byte}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
DUMMY_STRINGOP_ALGS},
{{rep_prefix_4_byte, {{1024, unrolled_loop},
{8192, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
{{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
- DUMMY_STRINGOP_ALGS}
+ DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
{{libcall, {{8, loop}, {24, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
- {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+ {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar_load_cost. */
+ 2, /* scalar_store_cost. */
+ 5, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 3, /* vec_unalign_load_cost. */
+ 3, /* vec_store_cost. */
+ 6, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
struct processor_costs amdfam10_cost = {
{libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
{{libcall, {{8, loop}, {24, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
- {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+ {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 4, /* scalar_stmt_cost. */
+ 2, /* scalar_load_cost. */
+ 2, /* scalar_store_cost. */
+ 6, /* vec_stmt_cost. */
+ 0, /* vec_to_scalar_cost. */
+ 2, /* scalar_to_vec_cost. */
+ 2, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 2, /* vec_store_cost. */
+ 6, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
{-1, libcall}}},
DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
{-1, libcall}}},
{libcall, {{24, loop}, {64, unrolled_loop},
- {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
static const
{{libcall, {{8, loop}, {15, unrolled_loop},
{2048, rep_prefix_4_byte}, {-1, libcall}}},
{libcall, {{24, loop}, {32, unrolled_loop},
- {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+ {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
/* Generic64 should produce code tuned for Nocona and K8. */
{DUMMY_STRINGOP_ALGS,
{libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
{DUMMY_STRINGOP_ALGS,
- {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+ {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
/* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8. */
DUMMY_STRINGOP_ALGS},
{{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
DUMMY_STRINGOP_ALGS},
+ 1, /* scalar_stmt_cost. */
+ 1, /* scalar_load_cost. */
+ 1, /* scalar_store_cost. */
+ 1, /* vec_stmt_cost. */
+ 1, /* vec_to_scalar_cost. */
+ 1, /* scalar_to_vec_cost. */
+ 1, /* vec_align_load_cost. */
+ 2, /* vec_unalign_load_cost. */
+ 1, /* vec_store_cost. */
+ 3, /* cond_taken_branch_cost. */
+ 1, /* cond_not_taken_branch_cost. */
};
const struct processor_costs *ix86_cost = &pentium_cost;
{ NULL, 0, 0, false, false, false, NULL }
};
+/* Implement targetm.vectorize.builtin_vectorization_cost. */
+static int
+x86_builtin_vectorization_cost (bool runtime_test)
+{
+  /* If the branch of the runtime test is taken - i.e. the vectorized
+     version is skipped - this incurs a misprediction cost (because the
+     vectorized version is expected to be the fall-through).  So we subtract
+     the latency of a mispredicted branch from the costs that are incurred
+     when the vectorized version is executed.
+
+     TODO: The values in the individual target cost tables have to be tuned,
+     or new fields may be needed.  For example, on K8 the default branch
+     path is the not-taken path.  If the taken path is predicted correctly,
+     the minimum penalty of going down the taken path is 1 cycle.  If the
+     taken path is not predicted correctly, the minimum penalty is 10
+     cycles.  */
+
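+  /* For example, with a cond_taken_branch_cost of 3 in the active cost
+     table, the runtime test contributes -3 to the outside-of-loop cost of
+     the vectorized version.  */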
+  if (runtime_test)
+    return -(ix86_cost->cond_taken_branch_cost);
+  else
+    return 0;
+}
+
/* Initialize the GCC target structure. */
#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE ix86_function_value
+#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST x86_builtin_vectorization_cost
+
struct gcc_target targetm = TARGET_INITIALIZER;
\f
#include "gt-i386.h"
basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
int nbbs = loop->num_nodes;
int byte_misalign;
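+  /* Cost of the branches guarding the peeled prologue/epilogue loops.  */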
+ int peel_guard_costs = 0;
int innerloop_iters = 0, factor;
VEC (slp_instance, heap) *slp_instances;
slp_instance instance;
if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
{
- vec_outside_cost += TARG_COND_BRANCH_COST;
+ vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: Adding cost of checks for loop "
"versioning.\n");
loop.
FORNOW: If we don't know the value of peel_iters for prologue or epilogue
- at compile-time - we assume it's (vf-1)/2 (the worst would be vf-1).
+ at compile time, we assume it's vf/2 (the worst would be vf-1).
TODO: Build an expression that represents peel_iters for prologue and
epilogue to be used in a run-time test. */
if (byte_misalign < 0)
{
- peel_iters_prologue = (vf - 1)/2;
+ peel_iters_prologue = vf/2;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: "
- "prologue peel iters set to (vf-1)/2.");
+ "prologue peel iters set to vf/2.");
/* If peeling for alignment is unknown, loop bound of main loop becomes
unknown. */
- peel_iters_epilogue = (vf - 1)/2;
+ peel_iters_epilogue = vf/2;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: "
- "epilogue peel iters set to (vf-1)/2 because "
+ "epilogue peel iters set to vf/2 because "
"peeling for alignment is unknown .");
+
+      /* If the peeled iteration counts are unknown, count a taken and a
+         not-taken branch per peeled loop (one prologue and one epilogue
+         loop, hence the factor of 2).  Even if the scalar iteration count
+         is known, the vector iteration count is not, since the number of
+         peeled prologue iterations is unknown; hence the guards remain.  */
+ peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
+ + TARG_COND_NOT_TAKEN_BRANCH_COST);
+
}
else
{
if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
{
- peel_iters_epilogue = (vf - 1)/2;
+ peel_iters_epilogue = vf/2;
if (vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "cost model: "
- "epilogue peel iters set to (vf-1)/2 because "
+ "epilogue peel iters set to vf/2 because "
"loop iterations are unknown .");
+
+         /* If the peeled iteration counts are known but the number of
+            scalar loop iterations is unknown, count a taken branch per
+            peeled loop (again, one prologue and one epilogue loop).  */
+ peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
+
}
else
{
}
}
- /* Requires a prologue loop when peeling to handle misalignment. Add cost of
- two guards, one for the peeled loop and one for the vector loop. */
-
- if (peel_iters_prologue)
- {
- vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "cost model: Adding cost of checks for "
- "prologue.\n");
- }
-
- /* Requires an epilogue loop to finish up remaining iterations after vector
- loop. Add cost of two guards, one for the peeled loop and one for the
- vector loop. */
-
- if (peel_iters_epilogue
- || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
- || LOOP_VINFO_INT_NITERS (loop_vinfo) % vf)
- {
- vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
- if (vect_print_dump_info (REPORT_DETAILS))
- fprintf (vect_dump, "cost model : Adding cost of checks for "
- "epilogue.\n");
- }
-
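+  /* Add the scalar work done in the peeled prologue/epilogue iterations and
+     the guard branches around those peel loops to the outside-of-loop cost
+     of the vectorized version.  */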
vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
- + (peel_iters_epilogue * scalar_single_iter_cost);
+ + (peel_iters_epilogue * scalar_single_iter_cost)
+ + peel_guard_costs;
/* Allow targets to add additional (outside-of-loop) costs. FORNOW, the only
information we provide for the target is whether testing against the
if ((scalar_single_iter_cost * vf) > vec_inside_cost)
{
- if (vec_outside_cost == 0)
+ if (vec_outside_cost <= 0)
min_profitable_iters = 1;
else
{
- min_profitable_iters = (vec_outside_cost * vf)
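+          /* The vector version becomes profitable once
+               niters * scalar_single_iter_cost
+                 > vec_outside_cost
+                   + vec_inside_cost * (niters - peel_iters_prologue
+                                        - peel_iters_epilogue) / vf;
+             solving this inequality for niters gives the expression
+             below.  */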
+ min_profitable_iters = (vec_outside_cost * vf
+ - vec_inside_cost * peel_iters_prologue
+ - vec_inside_cost * peel_iters_epilogue)
/ ((scalar_single_iter_cost * vf)
- vec_inside_cost);
peel_iters_epilogue);
fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
min_profitable_iters);
- fprintf (vect_dump, " Actual minimum iters for profitability: %d\n",
- min_profitable_iters < vf ? vf : min_profitable_iters);
}
min_profitable_iters =
if (niters <= min_profitable_iters)
then skip the vectorized loop. */
min_profitable_iters--;
+
+ if (vect_print_dump_info (REPORT_DETAILS))
+ fprintf (vect_dump, " Profitability threshold = %d\n",
+ min_profitable_iters);
+
return min_profitable_iters;
}
/* Analyze cost to set threshold for vectorized loop. */
min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
- min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
- * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
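+  /* The -1 presumably keeps this bound in the same "niters <= threshold
+     ==> skip the vectorized loop" form as min_profitable_iters above, so
+     the two thresholds can be compared directly.  */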
+ min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+ * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
/* Use the cost model only if it is more conservative than user specified
threshold. */
|| min_profitable_iters > min_scalar_loop_bound))
th = (unsigned) min_profitable_iters;
- if (min_profitable_iters
- && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ if (((LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+ || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
&& vect_print_dump_info (REPORT_DETAILS))
fprintf (vect_dump, "vectorization may not be profitable.");