rsha Jagasia <harsha.jagasia@amd.com>
authorHarsha Jagasia <harsha.jagasia@amd.com>
Tue, 11 Sep 2007 00:13:47 +0000 (00:13 +0000)
committerHarsha Jagasia <hjagasia@gcc.gnu.org>
Tue, 11 Sep 2007 00:13:47 +0000 (00:13 +0000)
            Jan Sjodin <jan.sjodin@amd.com>

        * tree-vect-analyze.c (vect_analyze_operations): Change
        comparison of loop iterations with threshold to less than
        or equal to instead of less than. Reduce
        min_scalar_loop_bound by one.
        * tree-vect-transform.c (vect_estimate_min_profitable_iters):
        Change prologue and epilogue iterations estimate to vf/2,
        when unknown at compile-time. Change versioning guard
        cost to taken_branch_cost. If peeling for alignment is
        unknown at compile-time, change peel guard costs to one
        taken branch and one not-taken branch per peeled loop.
        If peeling for alignment is known but number of scalar loop
        iterations is unknown at compile-time, change peel guard
        costs to one taken branch per peeled loop. Change the cost
        model equation to consider vector iterations as the loop
        iterations less the prologue and epilogue iterations.
        Change outside vector cost check to less than or equal to
        zero instead of equal to zero.
        (vect_do_peeling_for_loop_bound): Reduce
        min_scalar_loop_bound by one.
        * tree-vectorizer.h: Add TARG_COND_TAKEN_BRANCH_COST and
        TARG_COND_NOT_TAKEN_BRANCH_COST.
        * config/i386/i386.h (processor_costs): Add
        scalar_stmt_cost, scalar_load_cost, scalar_store_cost,
        vec_stmt_cost, vec_to_scalar_cost, scalar_to_vec_cost,
        vec_align_load_cost, vect_unalign_load_cost,
        vec_store_cost, cond_taken_branch_cost,
        cond_not_taken_branch_cost.
        Define macros for x86 costs.
        * config/i386/i386.c:
        (size_cost): Set scalar_stmt_cost, scalar_load_cost,
        scalar_store_cost, vec_stmt_cost, vec_to_scalar_cost,
        scalar_to_vec_cost, vec_align_load_cost,
        vect_unalign_load_cost, vec_store_cost,
        cond_taken_branch_cost, cond_not_taken_branch_cost to one.
        (i386_cost, i486_cost, pentium_cost, pentiumpro_cost,
        geode_cost, k6_cost, athlon_cost, pentium4_cost, nocona_cost,
        core2_cost, generic64_cost, generic32_cost): Set to default
        untuned costs.
        (k8_cost, amdfam10_cost): Costs for vectorization tuned.
        (x86_builtin_vectorization_cost): New.

2007-09-10  Harsha Jagasia <harsha.jagasia@amd.com>

        * gcc.dg/vect/costmodel/i386/costmodel-vect-31.c:
        Change dg-final to expect 1 non-profitable loop and
        3 profitable loops.
        * gcc.dg/vect/costmodel/x86-64/costmodel-vect-31.c:
        Change dg-final to expect 1 non-profitable loop and
        3 profitable loops.
        * gcc.dg/vect/costmodel/x86-64/costmodel-fast-math-vect-pr29925.c:
        Change dg-final to expect 1 profitable loop.
        * gcc.dg/vect/costmodel/i386/costmodel-fast-math-vect-pr29925.c:
        Change dg-final to expect 1 profitable loop.

Co-Authored-By: Jan Sjodin <jan.sjodin@amd.com>
From-SVN: r128353

gcc/ChangeLog
gcc/config/i386/i386.c
gcc/config/i386/i386.h
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.dg/vect/costmodel/i386/costmodel-fast-math-vect-pr29925.c
gcc/testsuite/gcc.dg/vect/costmodel/i386/costmodel-vect-31.c
gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-fast-math-vect-pr29925.c
gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-vect-31.c
gcc/tree-vect-analyze.c
gcc/tree-vect-transform.c
gcc/tree-vectorizer.h

index 9bcfca14751ab7038dc004857d82b6fccf817748..ab75d14b9122ee9f840e2f69d42f9f8427c1c63d 100644 (file)
@@ -1,3 +1,47 @@
+2007-09-10  Harsha Jagasia <harsha.jagasia@amd.com>
+            Jan Sjodin <jan.sjodin@amd.com>
+       
+       * tree-vect-analyze.c (vect_analyze_operations): Change
+       comparison of loop iterations with threshold to less than
+       or equal to instead of less than. Reduce
+       min_scalar_loop_bound by one.
+       * tree-vect-transform.c (vect_estimate_min_profitable_iters): 
+       Change prologue and epilogue iterations estimate to vf/2,
+       when unknown at compile-time. Change versioning guard
+       cost to taken_branch_cost. If peeling for alignment is
+       unknown at compile-time, change peel guard costs to one
+       taken branch and one not-taken branch per peeled loop.
+       If peeling for alignment is known but number of scalar loop
+       iterations is unknown at compile-time, change peel guard
+       costs to one taken branch per peeled loop. Change the cost
+       model equation to consider vector iterations as the loop
+       iterations less the prologue and epilogue iterations.
+       Change outside vector cost check to less than or equal to
+       zero instead of equal to zero.
+       (vect_do_peeling_for_loop_bound): Reduce
+       min_scalar_loop_bound by one.
+       * tree-vectorizer.h: Add TARG_COND_TAKEN_BRANCH_COST and
+       TARG_COND_NOT_TAKEN_BRANCH_COST.        
+       * config/i386/i386.h (processor_costs): Add
+       scalar_stmt_cost, scalar_load_cost, scalar_store_cost,
+       vec_stmt_cost, vec_to_scalar_cost, scalar_to_vec_cost,
+       vec_align_load_cost, vect_unalign_load_cost,
+       vec_store_cost, cond_taken_branch_cost,
+       cond_not_taken_branch_cost.
+       Define macros for x86 costs.
+       * config/i386/i386.c:
+       (size_cost): Set scalar_stmt_cost, scalar_load_cost,
+       scalar_store_cost, vec_stmt_cost, vec_to_scalar_cost,
+       scalar_to_vec_cost, vec_align_load_cost, 
+       vect_unalign_load_cost, vec_store_cost,
+       cond_taken_branch_cost, cond_not_taken_branch_cost to one. 
+       (i386_cost, i486_cost, pentium_cost, pentiumpro_cost,
+       geode_cost, k6_cost, athlon_cost, pentium4_cost, nocona_cost, 
+       core2_cost, generic64_cost, generic32_cost): Set to default
+       untuned costs.
+       (k8_cost, amdfam10_cost): Costs for vectorization tuned.
+       (x86_builtin_vectorization_cost): New.
+
 2007-09-10  Janis Johnson  <janis187@us.ibm.com>
            Ben Elliston  <bje@au.ibm.com>
 
index b1dda95adc7bd1206e1267b16005912aa3b3781b..ce7c19b46ec59ecbd5b6eb77e4558f0c64d289cd 100644 (file)
@@ -52,6 +52,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tm-constrs.h"
 #include "params.h"
 
+static int x86_builtin_vectorization_cost (bool);
+
 #ifndef CHECK_STACK_LIMIT
 #define CHECK_STACK_LIMIT (-1)
 #endif
@@ -126,7 +128,18 @@ struct processor_costs size_cost = {       /* costs for tuning for size */
   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
    {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
-   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}}
+   {rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}}},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  1,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  1,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 /* Processor costs (relative to an add) */
@@ -187,6 +200,17 @@ struct processor_costs i386_cost = {       /* 386 specific costs */
    DUMMY_STRINGOP_ALGS},
   {{rep_prefix_1_byte, {{-1, rep_prefix_1_byte}}},
    DUMMY_STRINGOP_ALGS},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 static const
@@ -247,7 +271,18 @@ struct processor_costs i486_cost = {       /* 486 specific costs */
   {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
    DUMMY_STRINGOP_ALGS},
   {{rep_prefix_4_byte, {{-1, rep_prefix_4_byte}}},
-   DUMMY_STRINGOP_ALGS}
+   DUMMY_STRINGOP_ALGS},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 static const
@@ -306,7 +341,18 @@ struct processor_costs pentium_cost = {
   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
    DUMMY_STRINGOP_ALGS},
   {{libcall, {{-1, rep_prefix_4_byte}}},
-   DUMMY_STRINGOP_ALGS}
+   DUMMY_STRINGOP_ALGS},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 static const
@@ -372,7 +418,18 @@ struct processor_costs pentiumpro_cost = {
    DUMMY_STRINGOP_ALGS},
   {{rep_prefix_4_byte, {{1024, unrolled_loop},
                        {8192, rep_prefix_4_byte}, {-1, libcall}}},
-   DUMMY_STRINGOP_ALGS}
+   DUMMY_STRINGOP_ALGS},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 static const
@@ -432,7 +489,18 @@ struct processor_costs geode_cost = {
   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
    DUMMY_STRINGOP_ALGS},
   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
-   DUMMY_STRINGOP_ALGS}
+   DUMMY_STRINGOP_ALGS},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 static const
@@ -494,7 +562,18 @@ struct processor_costs k6_cost = {
   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
    DUMMY_STRINGOP_ALGS},
   {{libcall, {{256, rep_prefix_4_byte}, {-1, libcall}}},
-   DUMMY_STRINGOP_ALGS}
+   DUMMY_STRINGOP_ALGS},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 static const
@@ -556,7 +635,18 @@ struct processor_costs athlon_cost = {
   {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
    DUMMY_STRINGOP_ALGS},
   {{libcall, {{2048, rep_prefix_4_byte}, {-1, libcall}}},
-   DUMMY_STRINGOP_ALGS}
+   DUMMY_STRINGOP_ALGS},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 static const
@@ -624,7 +714,18 @@ struct processor_costs k8_cost = {
    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
   {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  4,                                    /* scalar_stmt_cost.  */
+  2,                                    /* scalar load_cost.  */
+  2,                                    /* scalar_store_cost.  */
+  5,                                    /* vec_stmt_cost.  */
+  0,                                    /* vec_to_scalar_cost.  */
+  2,                                    /* scalar_to_vec_cost.  */
+  2,                                    /* vec_align_load_cost.  */
+  3,                                    /* vec_unalign_load_cost.  */
+  3,                                    /* vec_store_cost.  */
+  6,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 struct processor_costs amdfam10_cost = {
@@ -700,7 +801,18 @@ struct processor_costs amdfam10_cost = {
    {libcall, {{16, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
   {{libcall, {{8, loop}, {24, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
-   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+   {libcall, {{48, unrolled_loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  4,                                    /* scalar_stmt_cost.  */
+  2,                                    /* scalar load_cost.  */
+  2,                                    /* scalar_store_cost.  */
+  6,                                    /* vec_stmt_cost.  */
+  0,                                    /* vec_to_scalar_cost.  */
+  2,                                    /* scalar_to_vec_cost.  */
+  2,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  2,                                    /* vec_store_cost.  */
+  6,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 static const
@@ -761,6 +873,17 @@ struct processor_costs pentium4_cost = {
   {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
    {-1, libcall}}},
    DUMMY_STRINGOP_ALGS},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 static const
@@ -822,7 +945,18 @@ struct processor_costs nocona_cost = {
   {{libcall, {{6, loop_1_byte}, {48, loop}, {20480, rep_prefix_4_byte},
    {-1, libcall}}},
    {libcall, {{24, loop}, {64, unrolled_loop},
-             {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+             {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 static const
@@ -883,7 +1017,18 @@ struct processor_costs core2_cost = {
   {{libcall, {{8, loop}, {15, unrolled_loop},
              {2048, rep_prefix_4_byte}, {-1, libcall}}},
    {libcall, {{24, loop}, {32, unrolled_loop},
-             {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+             {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 /* Generic64 should produce code tuned for Nocona and K8.  */
@@ -949,7 +1094,18 @@ struct processor_costs generic64_cost = {
   {DUMMY_STRINGOP_ALGS,
    {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
   {DUMMY_STRINGOP_ALGS,
-   {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}}
+   {libcall, {{32, loop}, {8192, rep_prefix_8_byte}, {-1, libcall}}}},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 /* Generic32 should produce code tuned for Athlon, PPro, Pentium4, Nocona and K8.  */
@@ -1010,6 +1166,17 @@ struct processor_costs generic32_cost = {
    DUMMY_STRINGOP_ALGS},
   {{libcall, {{32, loop}, {8192, rep_prefix_4_byte}, {-1, libcall}}},
    DUMMY_STRINGOP_ALGS},
+  1,                                    /* scalar_stmt_cost.  */
+  1,                                    /* scalar load_cost.  */
+  1,                                    /* scalar_store_cost.  */
+  1,                                    /* vec_stmt_cost.  */
+  1,                                    /* vec_to_scalar_cost.  */
+  1,                                    /* scalar_to_vec_cost.  */
+  1,                                    /* vec_align_load_cost.  */
+  2,                                    /* vec_unalign_load_cost.  */
+  1,                                    /* vec_store_cost.  */
+  3,                                    /* cond_taken_branch_cost.  */
+  1,                                    /* cond_not_taken_branch_cost.  */
 };
 
 const struct processor_costs *ix86_cost = &pentium_cost;
@@ -23615,6 +23782,30 @@ static const struct attribute_spec ix86_attribute_table[] =
   { NULL,        0, 0, false, false, false, NULL }
 };
 
+/* Implement targetm.vectorize.builtin_vectorization_cost.  */
+static int 
+x86_builtin_vectorization_cost (bool runtime_test)
+{
+  /* If the branch of the runtime test is taken - i.e. - the vectorized
+     version is skipped - this incurs a misprediction cost (because the
+     vectorized version is expected to be the fall-through).  So we subtract
+     the latency of a mispredicted branch from the costs that are incured
+     when the vectorized version is executed.
+
+     TODO: The values in individual target tables have to be tuned or new
+     fields may be needed. For eg. on K8, the default branch path is the
+     not-taken path. If the taken path is predicted correctly, the minimum
+     penalty of going down the taken-path is 1 cycle. If the taken-path is
+     not predicted correctly, then the minimum penalty is 10 cycles.  */
+
+  if (runtime_test)
+    {
+      return (-(ix86_cost->cond_taken_branch_cost));
+    }
+  else
+    return 0;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ATTRIBUTE_TABLE
 #define TARGET_ATTRIBUTE_TABLE ix86_attribute_table
@@ -23791,6 +23982,9 @@ static const struct attribute_spec ix86_attribute_table[] =
 #undef TARGET_FUNCTION_VALUE
 #define TARGET_FUNCTION_VALUE ix86_function_value
 
+#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST x86_builtin_vectorization_cost
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 \f
 #include "gt-i386.h"
index e18e90f600cd4be3b9a73f0ae6b8700c0a655b26..a14c74b101d583c3bfa3e6c5730f1b3666a3eb3d 100644 (file)
@@ -138,6 +138,22 @@ struct processor_costs {
                                /* Specify what algorithm
                                   to use for stringops on unknown size.  */
   struct stringop_algs memcpy[2], memset[2];
+  const int scalar_stmt_cost;   /* Cost of any scalar operation, excluding
+                                  load and store.  */
+  const int scalar_load_cost;   /* Cost of scalar load.  */
+  const int scalar_store_cost;  /* Cost of scalar store.  */
+  const int vec_stmt_cost;      /* Cost of any vector operation, excluding
+                                   load, store, vector-to-scalar and
+                                   scalar-to-vector operation.  */
+  const int vec_to_scalar_cost;    /* Cost of vect-to-scalar operation.  */
+  const int scalar_to_vec_cost;    /* Cost of scalar-to-vector operation.  */
+  const int vec_align_load_cost;   /* Cost of aligned vector load.  */ 
+  const int vec_unalign_load_cost; /* Cost of unaligned vector load.  */
+  const int vec_store_cost;        /* Cost of vector store.  */
+  const int cond_taken_branch_cost;    /* Cost of taken branch for vectorizer
+                                         cost model.  */
+  const int cond_not_taken_branch_cost;/* Cost of not taken branch for
+                                         vectorizer cost model.  */
 };
 
 extern const struct processor_costs *ix86_cost;
@@ -2460,6 +2476,57 @@ struct machine_function GTY(())
 #define SYMBOL_REF_DLLEXPORT_P(X) \
        ((SYMBOL_REF_FLAGS (X) & SYMBOL_FLAG_DLLEXPORT) != 0)
 
+/* Model costs for vectorizer.  */
+
+/* Cost of conditional branch.  */
+#undef TARG_COND_BRANCH_COST
+#define TARG_COND_BRANCH_COST           ix86_cost->branch_cost
+
+/* Cost of any scalar operation, excluding load and store.  */
+#undef TARG_SCALAR_STMT_COST
+#define TARG_SCALAR_STMT_COST           ix86_cost->scalar_stmt_cost
+
+/* Cost of scalar load.  */
+#undef TARG_SCALAR_LOAD_COST
+#define TARG_SCALAR_LOAD_COST           ix86_cost->scalar_load_cost
+
+/* Cost of scalar store.  */
+#undef TARG_SCALAR_STORE_COST
+#define TARG_SCALAR_STORE_COST          ix86_cost->scalar_store_cost
+
+/* Cost of any vector operation, excluding load, store or vector to scalar
+   operation.  */ 
+#undef TARG_VEC_STMT_COST
+#define TARG_VEC_STMT_COST              ix86_cost->vec_stmt_cost
+
+/* Cost of vector to scalar operation.  */
+#undef TARG_VEC_TO_SCALAR_COST
+#define TARG_VEC_TO_SCALAR_COST         ix86_cost->vec_to_scalar_cost
+
+/* Cost of scalar to vector operation.  */
+#undef TARG_SCALAR_TO_VEC_COST
+#define TARG_SCALAR_TO_VEC_COST         ix86_cost->scalar_to_vec_cost
+
+/* Cost of aligned vector load.  */
+#undef TARG_VEC_LOAD_COST
+#define TARG_VEC_LOAD_COST              ix86_cost->vec_align_load_cost
+
+/* Cost of misaligned vector load.  */
+#undef TARG_VEC_UNALIGNED_LOAD_COST
+#define TARG_VEC_UNALIGNED_LOAD_COST    ix86_cost->vec_unalign_load_cost
+
+/* Cost of vector store.  */
+#undef TARG_VEC_STORE_COST
+#define TARG_VEC_STORE_COST             ix86_cost->vec_store_cost
+
+/* Cost of conditional taken branch for vectorizer cost model.  */
+#undef TARG_COND_TAKEN_BRANCH_COST
+#define TARG_COND_TAKEN_BRANCH_COST     ix86_cost->cond_taken_branch_cost
+
+/* Cost of conditional not taken branch for vectorizer cost model.  */
+#undef TARG_COND_NOT_TAKEN_BRANCH_COST
+#define TARG_COND_NOT_TAKEN_BRANCH_COST ix86_cost->cond_not_taken_branch_cost
+
 /*
 Local variables:
 version-control: t
index 792984e9ae39e6abd805fd98c0be6c09592a1de2..aca143371ff4cad73110b0582d365ad9e027cda6 100644 (file)
@@ -1,3 +1,16 @@
+2007-09-10  Harsha Jagasia <harsha.jagasia@amd.com>
+
+        * gcc.dg/vect/costmodel/i386/costmodel-vect-31.c: 
+       Change dg-final to expect 1 non-profitable loop and
+       3 profitable loops.
+        * gcc.dg/vect/costmodel/x86-64/costmodel-vect-31.c:
+       Change dg-final to expect 1 non-profitable loop and
+       3 profitable loops.
+        * gcc.dg/vect/costmodel/x86-64/costmodel-fast-math-vect-pr29925.c:
+       Change dg-final to expect 1 profitable loop.
+        * gcc.dg/vect/costmodel/i386/costmodel-fast-math-vect-pr29925.c:
+       Change dg-final to expect 1 profitable loop.    
+       
 2007-09-10  Richard Sandiford  <richard@codesourcery.com>
 
        * gcc.target/mips/call-saved-1.c: New test.
index 5d84017dd240e485bfe8b9ae83230cc551987d57..2766ced28715ed0be2f12a8e3e077c569b16389a 100644 (file)
@@ -35,6 +35,6 @@ int main()
    return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
 
index 11e8a29eada5b2356a6b7e6b2a41ac34a29b81e9..efab30d4ac6c5a08c7d9766f996b100c5518d444 100644 (file)
@@ -85,7 +85,7 @@ int main (void)
   return main1 ();
 } 
 
-/* { dg-final { scan-tree-dump-times "vectorization not profitable" 2 "vect" } }
+/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } }
  */
-/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
index 9a9f97e802bee532d950d4e8b4608747cb76afe2..0bc09f6da32eed09e0105e9a5f6888e2795186d8 100644 (file)
@@ -35,6 +35,6 @@ int main()
    return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
 
index 11e8a29eada5b2356a6b7e6b2a41ac34a29b81e9..efab30d4ac6c5a08c7d9766f996b100c5518d444 100644 (file)
@@ -85,7 +85,7 @@ int main (void)
   return main1 ();
 } 
 
-/* { dg-final { scan-tree-dump-times "vectorization not profitable" 2 "vect" } }
+/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" } }
  */
-/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
index 684d12dfcb437c4d8862a3e957157c725367e528..557bf2fb47046bee375b2f573e16210915ac5518 100644 (file)
@@ -596,8 +596,8 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
       return false;
     }
 
-  min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
-                          * vectorization_factor;
+  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+                           * vectorization_factor) - 1);
 
   /* Use the cost model only if it is more conservative than user specified
      threshold.  */
@@ -609,7 +609,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo)
     th = (unsigned) min_profitable_iters;
 
   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
+      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
     {
       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS))          
         fprintf (vect_dump, "not vectorized: vectorization not "
index 30dbf712e55d15aecb5ed28b0adc633df5b5b731..e2ee92b0d0e541fb8f00309bbebb4e60c95971b4 100644 (file)
@@ -124,6 +124,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
   int nbbs = loop->num_nodes;
   int byte_misalign;
+  int peel_guard_costs = 0;
   int innerloop_iters = 0, factor;
   VEC (slp_instance, heap) *slp_instances;
   slp_instance instance;
@@ -141,7 +142,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
 
   if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
     {
-      vec_outside_cost += TARG_COND_BRANCH_COST;
+      vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                  "versioning.\n");
@@ -188,7 +189,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
      loop.
 
      FORNOW: If we dont know the value of peel_iters for prologue or epilogue
-     at compile-time - we assume it's (vf-1)/2 (the worst would be vf-1).
+     at compile-time - we assume it's vf/2 (the worst would be vf-1).
 
      TODO: Build an expression that represents peel_iters for prologue and
      epilogue to be used in a run-time test.  */
@@ -197,18 +198,26 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
 
   if (byte_misalign < 0)
     {
-      peel_iters_prologue = (vf - 1)/2;
+      peel_iters_prologue = vf/2;
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "cost model: "
-                 "prologue peel iters set to (vf-1)/2.");
+                 "prologue peel iters set to vf/2.");
 
       /* If peeling for alignment is unknown, loop bound of main loop becomes
          unknown.  */
-      peel_iters_epilogue = (vf - 1)/2;
+      peel_iters_epilogue = vf/2;
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "cost model: "
-                 "epilogue peel iters set to (vf-1)/2 because "
+                 "epilogue peel iters set to vf/2 because "
                  "peeling for alignment is unknown .");
+
+      /* If peeled iterations are unknown, count a taken branch and a not taken
+        branch per peeled loop. Even if scalar loop iterations are known, 
+        vector iterations are not known since peeled prologue iterations are
+        not known. Hence guards remain the same.  */
+      peel_guard_costs +=  2 * (TARG_COND_TAKEN_BRANCH_COST
+                              + TARG_COND_NOT_TAKEN_BRANCH_COST);
+
     }
   else 
     {
@@ -226,11 +235,16 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
 
       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
         {
-          peel_iters_epilogue = (vf - 1)/2;
+          peel_iters_epilogue = vf/2;
           if (vect_print_dump_info (REPORT_DETAILS))
             fprintf (vect_dump, "cost model: "
-                     "epilogue peel iters set to (vf-1)/2 because "
+                     "epilogue peel iters set to vf/2 because "
                      "loop iterations are unknown .");
+
+         /* If peeled iterations are known but number of scalar loop
+            iterations are unknown, count a taken branch per peeled loop.  */
+         peel_guard_costs +=  2 * TARG_COND_TAKEN_BRANCH_COST;
+
         }
       else      
        {
@@ -241,33 +255,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
        }
     }
 
-  /* Requires a prologue loop when peeling to handle misalignment. Add cost of
-     two guards, one for the peeled loop and one for the vector loop.  */
-
-  if (peel_iters_prologue)
-    {
-      vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "cost model: Adding cost of checks for "
-                 "prologue.\n");
-    }
-
- /* Requires an epilogue loop to finish up remaining iterations after vector
-    loop. Add cost of two guards, one for the peeled loop and one for the
-    vector loop.  */
-
-  if (peel_iters_epilogue
-      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vf)
-    {
-      vec_outside_cost += 2 * TARG_COND_BRANCH_COST;
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "cost model : Adding cost of checks for "
-                 "epilogue.\n");
-    }
-
   vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
-                      + (peel_iters_epilogue * scalar_single_iter_cost);
+                      + (peel_iters_epilogue * scalar_single_iter_cost)
+                      + peel_guard_costs;
 
   /* Allow targets add additional (outside-of-loop) costs. FORNOW, the only
      information we provide for the target is whether testing against the
@@ -305,11 +295,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
 
   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
     {
-      if (vec_outside_cost == 0)
+      if (vec_outside_cost <= 0)
         min_profitable_iters = 1;
       else
         {
-          min_profitable_iters = (vec_outside_cost * vf)
+          min_profitable_iters = (vec_outside_cost * vf 
+                                  - vec_inside_cost * peel_iters_prologue
+                                  - vec_inside_cost * peel_iters_epilogue)
                                  / ((scalar_single_iter_cost * vf)
                                     - vec_inside_cost);
 
@@ -344,8 +336,6 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
                peel_iters_epilogue);
       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
-      fprintf (vect_dump, "  Actual minimum iters for profitability: %d\n",
-              min_profitable_iters < vf ? vf : min_profitable_iters);
     }
 
   min_profitable_iters = 
@@ -355,6 +345,11 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
      if (niters <= min_profitable_iters)
        then skip the vectorized loop.  */
   min_profitable_iters--;
+
+  if (vect_print_dump_info (REPORT_DETAILS))
+    fprintf (vect_dump, "  Profitability threshold = %d\n",
+            min_profitable_iters);
+    
   return min_profitable_iters;
 }
 
@@ -6452,8 +6447,8 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
 
   /* Analyze cost to set threshhold for vectorized loop.  */
   min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo);
-  min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND))
-                          * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+                           * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
 
   /* Use the cost model only if it is more conservative than user specified
      threshold.  */
@@ -6464,8 +6459,8 @@ vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio)
           || min_profitable_iters > min_scalar_loop_bound))
     th = (unsigned) min_profitable_iters;
 
-  if (min_profitable_iters
-      && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+  if (((LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
       && vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "vectorization may not be profitable.");
 
index 913c524137b1f4d56df35094ac976bd92e7ab3c6..49ee045237841a1451da04ab8ecf17cfcfc44db9 100644 (file)
@@ -469,9 +469,14 @@ typedef struct _stmt_vec_info {
 /* These are some defines for the initial implementation of the vectorizer's
    cost model.  These will later be target specific hooks.  */
 
-/* Cost of conditional branch.  */
-#ifndef TARG_COND_BRANCH_COST
-#define TARG_COND_BRANCH_COST        3
+/* Cost of conditional taken branch.  */
+#ifndef TARG_COND_TAKEN_BRANCH_COST
+#define TARG_COND_TAKEN_BRANCH_COST        3
+#endif
+
+/* Cost of conditional not taken branch.  */
+#ifndef TARG_COND_NOT_TAKEN_BRANCH_COST
+#define TARG_COND_NOT_TAKEN_BRANCH_COST        1
 #endif
 
 /* Cost of any scalar operation, excluding load and store.  */