tree-parloops.c (loop_parallel_p): Call vect_is_simple_reduction with additional...
author Ira Rosen <irar@il.ibm.com>
Sun, 12 Jul 2009 07:09:07 +0000 (07:09 +0000)
committer Ira Rosen <irar@gcc.gnu.org>
Sun, 12 Jul 2009 07:09:07 +0000 (07:09 +0000)
* tree-parloops.c (loop_parallel_p): Call vect_is_simple_reduction
with additional argument.
* tree-vectorizer.h (enum vect_def_type): Add
vect_double_reduction_def.
(vect_is_simple_reduction): Add argument.
* tree-vect-loop.c (vect_determine_vectorization_factor): Fix
indentation.
(vect_analyze_scalar_cycles_1): Detect double reduction. Call
vect_is_simple_reduction with additional argument.
(vect_analyze_loop_operations): Handle exit phi nodes in case of
double reduction.
(reduction_code_for_scalar_code): Handle additional codes by
returning ERROR_MARK for them. Fix comment and indentation.
(vect_is_simple_reduction): Fix comment, add argument to specify
double reduction. Detect double reduction.
(get_initial_def_for_induction): Fix indentation.
(get_initial_def_for_reduction): Fix comment and indentation.
Handle double reduction. Create initial definitions that do not
require adjustment if ADJUSTMENT_DEF is NULL. Handle additional cases.
(vect_create_epilog_for_reduction): Fix comment, add argument to
handle double reduction. Use PLUS_EXPR in case of MINUS_EXPR in
epilogue result extraction. Create double reduction phi node and
replace relevant uses.
(vectorizable_reduction): Call vect_is_simple_reduction with
additional argument. Fix indentation. Update epilogue code treatment
according to the changes in reduction_code_for_scalar_code. Check
for double reduction. Call vect_create_epilog_for_reduction with
additional argument.
* tree-vect-stmts.c (process_use): Handle double reduction, update
documentation.
(vect_mark_stmts_to_be_vectorized): Handle double reduction.
(vect_get_vec_def_for_operand): Likewise.

From-SVN: r149526

14 files changed:
gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.dg/vect/no-scevccp-outer-2.c
gcc/testsuite/gcc.dg/vect/vect-double-reduc-1.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/vect-double-reduc-2.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/vect-double-reduc-3.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/vect-double-reduc-4.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/vect-double-reduc-6.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/vect-double-reduc-7.c [new file with mode: 0644]
gcc/tree-parloops.c
gcc/tree-vect-loop.c
gcc/tree-vect-stmts.c
gcc/tree-vectorizer.h

index c8a39500b80073aa24f8364d0c70106dfc253c07..edeb0492b36ab990141f4e2c1f2d8087d0675c7e 100644 (file)
@@ -1,3 +1,38 @@
+2009-07-12  Ira Rosen  <irar@il.ibm.com>
+
+       * tree-parloops.c (loop_parallel_p): Call vect_is_simple_reduction
+       with additional argument.
+       * tree-vectorizer.h (enum vect_def_type): Add 
+       vect_double_reduction_def.
+       (vect_is_simple_reduction): Add argument.
+       * tree-vect-loop.c (vect_determine_vectorization_factor): Fix 
+       indentation.
+       (vect_analyze_scalar_cycles_1): Detect double reduction. Call
+       vect_is_simple_reduction with additional argument.
+       (vect_analyze_loop_operations): Handle exit phi nodes in case of
+       double reduction.
+       (reduction_code_for_scalar_code): Handle additional codes by
+       returning ERROR_MARK for them. Fix comment and indentation.
+       (vect_is_simple_reduction): Fix comment, add argument to specify
+       double reduction. Detect double reduction.
+       (get_initial_def_for_induction): Fix indentation.
+       (get_initial_def_for_reduction): Fix comment and indentation.
+       Handle double reduction. Create initial definitions that do not
+       require adjustment if ADJUSTMENT_DEF is NULL. Handle additional cases.
+       (vect_create_epilog_for_reduction): Fix comment, add argument to
+       handle double reduction. Use PLUS_EXPR in case of MINUS_EXPR in
+       epilogue result extraction. Create double reduction phi node and
+       replace relevant uses.
+       (vectorizable_reduction): Call vect_is_simple_reduction with
+       additional argument. Fix indentation. Update epilogue code treatment
+       according to the changes in reduction_code_for_scalar_code. Check 
+       for double reduction. Call vect_create_epilog_for_reduction with
+       additional argument.
+       * tree-vect-stmts.c (process_use): Handle double reduction, update
+       documentation.
+       (vect_mark_stmts_to_be_vectorized): Handle double reduction.
+       (vect_get_vec_def_for_operand): Likewise.
+
 2009-07-12  Danny Smith  <dansmister@gmail.com>
 
        * config/i386/winnt.c (i386_pe_determine_dllexport_p): Don't
index 06d7675748b444988320447719415dd59daf0716..7df599ed587287c49bf3957f6d30a9e5e0f2639a 100644 (file)
@@ -1,3 +1,11 @@
+2009-07-12  Ira Rosen  <irar@il.ibm.com>
+
+       * gcc.dg/vect/no-scevccp-outer-2.c: Expect to vectorize.
+       * gcc.dg/vect/vect-double-reduc-1.c, gcc.dg/vect/vect-double-reduc-2.c,
+       gcc.dg/vect/vect-double-reduc-3.c, gcc.dg/vect/vect-double-reduc-4.c,
+       gcc.dg/vect/vect-double-reduc-5.c, gcc.dg/vect/vect-double-reduc-6.c,
+       gcc.dg/vect/vect-double-reduc-7.c: New tests.
+
 2009-07-12  Hans-Peter Nilsson  <hp@axis.com>
 
        * gfortran.dg/f2003_io_4.f03, gfortran.dg/read_size_noadvance.f90,
index a9ac09c4a2b044f7cc67f210099515207e8dfad6..13b37883c2e9f489d78e40de319ab29dc84c0b9a 100644 (file)
@@ -1,4 +1,6 @@
 /* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
 #define N 40
 
 int
@@ -14,5 +16,5 @@ foo (){
   return diff;
 }
 
-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect"  } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-1.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-1.c
new file mode 100644 (file)
index 0000000..e335842
--- /dev/null
@@ -0,0 +1,56 @@
+/* { dg-require-effective-target vect_int_mult } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int coeff[K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {642816,660736,678656,696576,714496,732416,750336,768256,786176,804096,822016,839936,857856,875776,893696,911616,929536,947456,965376,983296,1001216,1019136,1037056,1054976,1072896,1090816,1108736,1126656,1144576,1162496,1180416,1198336};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int sum = 0, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      sum = 0;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++) 
+          sum += in[i+k][j] * coeff[i][j];
+      out[k] = sum;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        coeff[i][j] = i+2;
+    }
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-2.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-2.c
new file mode 100644 (file)
index 0000000..be469be
--- /dev/null
@@ -0,0 +1,56 @@
+/* { dg-require-effective-target vect_int_mult } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int coeff[K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {357184,339264,321344,303424,285504,267584,249664,231744,213824,195904,177984,160064,142144,124224,106304,88384,70464,52544,34624,16704,-1216,-19136,-37056,-54976,-72896,-90816,-108736,-126656,-144576,-162496,-180416,-198336};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int res = 0, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      res = 1000000;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++) 
+          res -= in[i+k][j] * coeff[i][j];
+      out[k] = res;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        coeff[i][j] = i+2;
+    }
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-3.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-3.c
new file mode 100644 (file)
index 0000000..87b5a04
--- /dev/null
@@ -0,0 +1,67 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int coeff[K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out_max[K], out_min[K];
+int check_max[K] = {62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93};
+int check_min[K] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+
+__attribute__ ((noinline)) void 
+foo (int x, int y)
+{
+  int max, min, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      max = x;
+      min = y;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++)
+          {
+            max = max < in[i+k][j] ? in[i+k][j] : max; 
+            min = min > in[i+k][j] ? in[i+k][j] : min; 
+          }
+      out_max[k] = max;
+      out_min[k] = min;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        coeff[i][j] = i+2;
+    }
+
+  foo(0, 0);
+
+  for (k = 0; k < K; k++)
+    if (out_max[k] != check_max[k] || out_min[k] != 0)
+      abort ();
+
+  foo(100, 45);
+
+  for (k = 0; k < K; k++)
+    if (out_min[k] != check_min[k] || out_max[k] != 100)
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail vect_no_int_max } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-4.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-4.c
new file mode 100644 (file)
index 0000000..90e0da7
--- /dev/null
@@ -0,0 +1,56 @@
+/* { dg-require-effective-target vect_int_mult } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int coeff[K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {652816,670736,688656,706576,724496,742416,760336,778256,796176,814096,832016,849936,867856,885776,903696,921616,939536,957456,975376,993296,1011216,1029136,1047056,1064976,1082896,1100816,1118736,1136656,1154576,1172496,1190416,1208336};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int sum = 0, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      sum = 10000;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++) 
+          sum += in[i+k][j] * coeff[i][j];
+      out[k] = sum;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        coeff[i][j] = i+2;
+    }
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c
new file mode 100644 (file)
index 0000000..f624d86
--- /dev/null
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int_mult } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+signed short in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+signed short coeff[K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {642816,660736,678656,696576,714496,732416,750336,768256,786176,804096,822016,839936,857856,875776,893696,911616,929536,947456,965376,983296,1001216,1019136,1037056,1054976,1072896,1090816,1108736,1126656,1144576,1162496,1180416,1198336};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int sum = 0, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      sum = 0;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++) 
+          sum += in[i+k][j] * coeff[i][j];
+      out[k] = sum;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        coeff[i][j] = i+2;
+    }
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+
+/* Vectorization of loops with multiple types and double reduction is not 
+   supported yet.  */       
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-6.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-6.c
new file mode 100644 (file)
index 0000000..f52b32b
--- /dev/null
@@ -0,0 +1,50 @@
+/* { dg-require-effective-target vect_int_mult } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 4 
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {0,16,256,4096};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int sum;
+  int i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      sum = 1;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++)
+          sum *= in[i+k][j];
+      out[k] = sum;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for (i = 0; i < 2*K; i++)
+    for (j = 0; j < K; j++)
+      in[i][j] = (i+2)/3;
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-7.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-7.c
new file mode 100644 (file)
index 0000000..9e7ced7
--- /dev/null
@@ -0,0 +1,65 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {63,63,191,191,127,127,191,191,127,127,191,191,127,127,191,191,127,127,191,191,127,127,191,191,127,127,191,191,127,127,191,191};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int res_or, res_and, res_xor, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      res_or = 0;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++) 
+          res_or = res_or | in[i+k][j];
+      res_and = 1;
+      for (j = 0; j < K; j++)
+        for (i = 0; i < K; i++)
+          res_and = res_and & in[i+k][j];
+
+      res_xor = 0;
+      for (j = 0; j < K; j++)
+        for (i = 0; i < K; i++)
+          res_xor = res_xor ^ in[i+k][j];
+
+      out[k] = res_or + res_and + res_xor;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        out[i] = i+j;
+    }
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 3 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
index 5f11fc77a1b43c59dbfb842c73c6daf2c81efb65..28c96a26f598ddc24827587a44c55dfffb21b1c3 100644 (file)
@@ -284,13 +284,15 @@ loop_parallel_p (struct loop *loop, htab_t reduction_list,
     {
       gimple phi = gsi_stmt (gsi);
       gimple reduc_stmt = NULL;
+      bool dummy;
 
       /* ??? TODO: Change this into a generic function that 
          recognizes reductions.  */
       if (!is_gimple_reg (PHI_RESULT (phi)))
        continue;
       if (simple_loop_info)
-       reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi, true);
+       reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi, true, 
+                                               &dummy);
 
       /*  Create a reduction_info struct, initialize it and insert it to 
          the reduction list.  */
index a37e3c00f72f5ba39aee2049189d00bd2d5cf80d..c96fb04a814ce2a2df443fa95d50c30ef769f15b 100644 (file)
@@ -291,8 +291,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
            }
          else
            {
-
-             gcc_assert (! STMT_VINFO_DATA_REF (stmt_info)
+             gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));
 
              scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, 
@@ -410,6 +409,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
   tree dumy;
   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
   gimple_stmt_iterator gsi;
+  bool double_reduc;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
@@ -477,26 +477,39 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 
       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
-      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle);
+      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle, 
+                                             &double_reduc);
       if (reduc_stmt)
         {
-          if (nested_cycle)
+          if (double_reduc)
             {
               if (vect_print_dump_info (REPORT_DETAILS))
-                fprintf (vect_dump, "Detected vectorizable nested cycle.");
+                fprintf (vect_dump, "Detected double reduction.");
 
-              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
+              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
-                                                        vect_nested_cycle;
+                                                    vect_double_reduction_def;
             }
-          else
+          else 
             {
-              if (vect_print_dump_info (REPORT_DETAILS))
-                fprintf (vect_dump, "Detected reduction.");
+              if (nested_cycle)
+                {
+                  if (vect_print_dump_info (REPORT_DETAILS))
+                    fprintf (vect_dump, "Detected vectorizable nested cycle.");
 
-              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
-              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
-                                                        vect_reduction_def;
+                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
+                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
+                                                             vect_nested_cycle;
+                }
+              else
+                {
+                  if (vect_print_dump_info (REPORT_DETAILS))
+                    fprintf (vect_dump, "Detected reduction.");
+
+                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
+                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
+                                                           vect_reduction_def;
+                }
             }
         }
       else
@@ -1111,10 +1124,13 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
               /* inner-loop loop-closed exit phi in outer-loop vectorization
                  (i.e. a phi in the tail of the outer-loop).
                  FORNOW: we currently don't support the case that these phis
-                 are not used in the outerloop, cause this case requires
-                 to actually do something here.  */
-              if (!STMT_VINFO_RELEVANT_P (stmt_info)
-                  || STMT_VINFO_LIVE_P (stmt_info))
+                 are not used in the outerloop (unless it is double reduction,
+                 i.e., this phi is vect_double_reduction_def), because this
+                 case requires to actually do something here.  */
+              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
+                   || STMT_VINFO_LIVE_P (stmt_info))
+                  && STMT_VINFO_DEF_TYPE (stmt_info) 
+                     != vect_double_reduction_def)
                 {
                   if (vect_print_dump_info (REPORT_DETAILS))
                     fprintf (vect_dump,
@@ -1466,31 +1482,40 @@ vect_analyze_loop (struct loop *loop)
    Output:
    REDUC_CODE - the corresponding tree-code to be used to reduce the
       vector of partial results into a single scalar result (which
-      will also reside in a vector).
+      will also reside in a vector) or ERROR_MARK if the operation is
+      a supported reduction operation, but does not have such tree-code.
 
-   Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise.  */
+   Return FALSE if CODE currently cannot be vectorized as reduction.  */
 
 static bool
 reduction_code_for_scalar_code (enum tree_code code,
                                 enum tree_code *reduc_code)
 {
   switch (code)
-  {
-  case MAX_EXPR:
-    *reduc_code = REDUC_MAX_EXPR;
-    return true;
-
-  case MIN_EXPR:
-    *reduc_code = REDUC_MIN_EXPR;
-    return true;
-
-  case PLUS_EXPR:
-    *reduc_code = REDUC_PLUS_EXPR;
-    return true;
-
-  default:
-    return false;
-  }
+    {
+      case MAX_EXPR:
+        *reduc_code = REDUC_MAX_EXPR;
+        return true;
+
+      case MIN_EXPR:
+        *reduc_code = REDUC_MIN_EXPR;
+        return true;
+
+      case PLUS_EXPR:
+        *reduc_code = REDUC_PLUS_EXPR;
+        return true;
+
+      case MULT_EXPR:
+      case MINUS_EXPR:
+      case BIT_IOR_EXPR:
+      case BIT_XOR_EXPR:
+      case BIT_AND_EXPR:
+        *reduc_code = ERROR_MARK;
+        return true;
+
+      default:
+       return false;
+    }
 }
 
 
@@ -1507,7 +1532,7 @@ report_vect_op (gimple stmt, const char *msg)
 
 /* Function vect_is_simple_reduction
 
-   Detect a cross-iteration def-use cycle that represents a simple
+   (1) Detect a cross-iteration def-use cycle that represents a simple
    reduction computation. We look for the following pattern:
 
    loop_header:
@@ -1524,12 +1549,20 @@ report_vect_op (gimple stmt, const char *msg)
    Condition 1 is tested here.
    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.  
 
-   Also detect a cross-iteration def-use cycle in nested loops, i.e., nested
-   cycles, if CHECK_REDUCTION is false.  */
+   (2) Detect a cross-iteration def-use cycle in nested loops, i.e., 
+   nested cycles, if CHECK_REDUCTION is false.  
+
+   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
+   reductions:
+
+     a1 = phi < a0, a2 >
+     inner loop (def of a3)
+     a2 = phi < a3 >    
+*/
 
 gimple
 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi, 
-                          bool check_reduction)
+                          bool check_reduction, bool *double_reduc)
 {
   struct loop *loop = (gimple_bb (phi))->loop_father;
   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
@@ -1543,6 +1576,9 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
   tree name;
   imm_use_iterator imm_iter;
   use_operand_p use_p;
+  bool phi_def;
+
+  *double_reduc = false;
 
   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
      otherwise, we assume outer loop vectorization.  */
@@ -1584,14 +1620,24 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
       return NULL;
     }
 
-  if (!is_gimple_assign (def_stmt))
+  if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
       return NULL;
     }
 
-  name = gimple_assign_lhs (def_stmt);
+  if (is_gimple_assign (def_stmt))
+    {
+      name = gimple_assign_lhs (def_stmt);
+      phi_def = false;
+    }
+  else
+    {
+      name = PHI_RESULT (def_stmt);
+      phi_def = true;
+    }
+
   nloop_uses = 0;
   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
     {
@@ -1608,6 +1654,37 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
        }
     }
 
+  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
+     defined in the inner loop.  */
+  if (phi_def)
+    {
+      op1 = PHI_ARG_DEF (def_stmt, 0);
+
+      if (gimple_phi_num_args (def_stmt) != 1
+          || TREE_CODE (op1) != SSA_NAME)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            fprintf (vect_dump, "unsupported phi node definition.");
+
+          return NULL;
+        }
+
+      def1 = SSA_NAME_DEF_STMT (op1); 
+      if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 
+          && loop->inner
+          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
+          && is_gimple_assign (def1))
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            report_vect_op (def_stmt, "detected double reduction: ");
+          *double_reduc = true;
+          return def_stmt;
+        }
+
+      return NULL;
+    }
+
   code = gimple_assign_rhs_code (def_stmt);
 
   if (check_reduction 
@@ -1697,7 +1774,6 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
       return NULL;
     }
 
-
   /* Check that one def is the reduction def, defined by PHI,
      the other def is either defined in the loop ("vect_internal_def"),
      or it's an induction (defined by a loop-header phi-node).  */
@@ -2296,7 +2372,7 @@ get_initial_def_for_induction (gimple iv_phi)
   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
   gcc_assert (access_fn);
   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
-                                  &init_expr, &step_expr);
+                                    &init_expr, &step_expr);
   gcc_assert (ok);
   pe = loop_preheader_edge (iv_loop);
 
@@ -2306,7 +2382,8 @@ get_initial_def_for_induction (gimple iv_phi)
       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts; We obtain it from
         the STMT_VINFO_VEC_STMT of the defining stmt. */
-      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
+      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, 
+                                           loop_preheader_edge (iv_loop));
       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
     }
   else
@@ -2507,18 +2584,16 @@ get_initial_def_for_induction (gimple iv_phi)
         vector of partial results.
 
    Option1 (adjust in epilog): Initialize the vector as follows:
-     add:         [0,0,...,0,0]
-     mult:        [1,1,...,1,1]
-     min/max:     [init_val,init_val,..,init_val,init_val]
-     bit and/or:  [init_val,init_val,..,init_val,init_val]
+     add/bit or/xor: [0,0,...,0,0]
+     mult/bit and:   [1,1,...,1,1]
+     min/max:        [init_val,init_val,..,init_val,init_val]
    and when necessary (e.g. add/mult case) let the caller know
    that it needs to adjust the result by init_val.
 
    Option2: Initialize the vector as follows:
-     add:         [0,0,...,0,init_val]
-     mult:        [1,1,...,1,init_val]
-     min/max:     [init_val,init_val,...,init_val]
-     bit and/or:  [init_val,init_val,...,init_val]
+     add/bit or/xor: [init_val,0,0,...,0]
+     mult/bit and:   [init_val,1,1,...,1]
+     min/max:        [init_val,init_val,...,init_val]
    and no adjustments are needed.
 
    For example, for the following code:
@@ -2533,11 +2608,14 @@ get_initial_def_for_induction (gimple iv_phi)
    the result at the end by 'init_val'.
 
    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
-   initialization vector is simpler (same element in all entries).
+   initialization vector is simpler (same element in all entries), if
+   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
+   
    A cost model should help decide between these two schemes.  */
 
 tree
-get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
+get_initial_def_for_reduction (gimple stmt, tree init_val, 
+                               tree *adjustment_def)
 {
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
@@ -2551,47 +2629,118 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
   tree t = NULL_TREE;
   int i;
   bool nested_in_vect_loop = false; 
+  tree init_value;
+  REAL_VALUE_TYPE real_init_val = dconst0;
+  int int_init_val = 0;
 
   gcc_assert (vectype);
   nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
              || SCALAR_FLOAT_TYPE_P (scalar_type));
+
   if (nested_in_vect_loop_p (loop, stmt))
     nested_in_vect_loop = true;
   else
     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
 
-  switch (code)
-  {
-  case WIDEN_SUM_EXPR:
-  case DOT_PROD_EXPR:
-  case PLUS_EXPR:
-  case MINUS_EXPR:
-    if (nested_in_vect_loop)
-      *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
-    else
-      *adjustment_def = init_val;
-    /* Create a vector of zeros for init_def.  */
-    if (SCALAR_FLOAT_TYPE_P (scalar_type))
-      def_for_init = build_real (scalar_type, dconst0);
-    else
-      def_for_init = build_int_cst (scalar_type, 0);
-      
-    for (i = nunits - 1; i >= 0; --i)
-      t = tree_cons (NULL_TREE, def_for_init, t);
-    init_def = build_vector (vectype, t);
-    break;
+  /* In case of double reduction we only create a vector variable to be put
+     in the reduction phi node. The actual statement creation is done in
+     vect_create_epilog_for_reduction.  */
+  if (TREE_CODE (init_val) == SSA_NAME
+      && vinfo_for_stmt (SSA_NAME_DEF_STMT (init_val)) 
+      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SSA_NAME_DEF_STMT (init_val))) 
+          == vect_double_reduction_def)
+    {
+      *adjustment_def = NULL;
+      return vect_create_destination_var (init_val, vectype);
+    }
 
-  case MIN_EXPR:
-  case MAX_EXPR:
-    *adjustment_def = NULL_TREE;
-    init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
-    break;
+  if (TREE_CONSTANT (init_val))
+    {
+      if (SCALAR_FLOAT_TYPE_P (scalar_type))
+        init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
+      else
+        init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
+    }
+  else
+    init_value = init_val;
 
-  default:
-    gcc_unreachable ();
-  }
+  switch (code)
+    {
+      case WIDEN_SUM_EXPR:
+      case DOT_PROD_EXPR:
+      case PLUS_EXPR:
+      case MINUS_EXPR:
+      case BIT_IOR_EXPR:
+      case BIT_XOR_EXPR:
+      case MULT_EXPR:
+      case BIT_AND_EXPR:
+        /* ADJUSTMENT_DEF is NULL when called from
+           vect_create_epilog_for_reduction to vectorize double reduction.  */
+        if (adjustment_def)
+          {
+            if (nested_in_vect_loop)
+              *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt, 
+                                                              NULL);
+            else
+              *adjustment_def = init_val;
+          }
+
+        if (code == MULT_EXPR || code == BIT_AND_EXPR)
+          {
+            real_init_val = dconst1;
+            int_init_val = 1;
+          }
+
+        if (SCALAR_FLOAT_TYPE_P (scalar_type))
+          def_for_init = build_real (scalar_type, real_init_val);
+        else
+          def_for_init = build_int_cst (scalar_type, int_init_val);
+
+        /* Create a vector of '0' or '1' except the first element.  */ 
+        for (i = nunits - 2; i >= 0; --i)
+          t = tree_cons (NULL_TREE, def_for_init, t);
+
+        /* Option1: the first element is '0' or '1' as well.  */
+        if (adjustment_def)
+          {
+            t = tree_cons (NULL_TREE, def_for_init, t);
+            init_def = build_vector (vectype, t);
+            break;
+          }
+
+        /* Option2: the first element is INIT_VAL.  */
+        t = tree_cons (NULL_TREE, init_value, t);
+        if (TREE_CONSTANT (init_val))
+          init_def = build_vector (vectype, t);
+        else
+          init_def = build_constructor_from_list (vectype, t);
+
+        break;
+
+      case MIN_EXPR:
+      case MAX_EXPR:
+        if (adjustment_def)
+          {
+            *adjustment_def = NULL_TREE;
+            init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
+            break;
+          }
+
+        for (i = nunits - 1; i >= 0; --i)
+          t = tree_cons (NULL_TREE, init_value, t);
+
+        if (TREE_CONSTANT (init_val))
+          init_def = build_vector (vectype, t);
+        else
+          init_def = build_constructor_from_list (vectype, t);
+
+        break;
+
+      default:
+        gcc_unreachable ();
+    }
 
   return init_def;
 }
@@ -2613,6 +2762,7 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
    REDUCTION_PHI is the phi-node that carries the reduction computation.
    REDUC_INDEX is the index of the operand in the right hand side of the 
      statement that is defined by REDUCTION_PHI.
+   DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
 
    This function:
    1. Creates the reduction def-use cycle: sets the arguments for 
@@ -2657,14 +2807,15 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
                                  int ncopies,
                                  enum tree_code reduc_code,
                                  gimple reduction_phi,
-                                  int reduc_index)
+                                  int reduc_index, 
+                                  bool double_reduc)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   stmt_vec_info prev_phi_info;
   tree vectype;
   enum machine_mode mode;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
   basic_block exit_bb;
   tree scalar_dest;
   tree scalar_type;
@@ -2694,6 +2845,7 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
   
   if (nested_in_vect_loop_p (loop, stmt))
     {
+      outer_loop = loop;
       loop = loop->inner;
       nested_in_vect_loop = true;
     }
@@ -2726,7 +2878,7 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
      the scalar def before the loop, that defines the initial value
      of the reduction variable.  */
   vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
-                                                 &adjustment_def);
+                                                 &adjustment_def);
 
   phi = reduction_phi;
   def = vect_def;
@@ -2744,8 +2896,8 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
        {
          fprintf (vect_dump, "transform reduction: created def-use cycle: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
-         fprintf (vect_dump, "\n");
-         print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
+          fprintf (vect_dump, "\n");
+          print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
        }
 
       phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
@@ -2831,15 +2983,25 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
   bitsize = TYPE_SIZE (scalar_type);
   bytesize = TYPE_SIZE_UNIT (scalar_type);
 
+  /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
+     partial results are added and not subtracted.  */
+  if (code == MINUS_EXPR)
+    code = PLUS_EXPR;
 
   /* In case this is a reduction in an inner-loop while vectorizing an outer
      loop - we don't need to extract a single scalar result at the end of the
-     inner-loop.  The final vector of partial results will be used in the
-     vectorized outer-loop, or reduced to a scalar result at the end of the
-     outer-loop.  */
-  if (nested_in_vect_loop)
+     inner-loop (unless it is a double reduction, i.e., the use of the
+     reduction is outside the outer-loop).  The final vector of partial
+     results will be used in the vectorized outer-loop, or reduced to a
+     scalar result at the end of the outer-loop.  */
+  if (nested_in_vect_loop && !double_reduc)
     goto vect_finalize_reduction;
 
+  /* The epilogue is created for the outer-loop, i.e., for the loop being
+     vectorized.  */
+  if (double_reduc)
+    loop = outer_loop;
+
   /* FORNOW */
   gcc_assert (ncopies == 1);
 
@@ -2914,6 +3076,7 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
               bit_offset /= 2)
            {
              tree bitpos = size_int (bit_offset);
+              
              epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
                                                          new_temp, bitpos);
              new_name = make_ssa_name (vec_dest, epilog_stmt);
@@ -2987,7 +3150,7 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
     {
       tree rhs;
 
-      gcc_assert (!nested_in_vect_loop);
+      gcc_assert (!nested_in_vect_loop || double_reduc);
       if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "extract scalar result");
 
@@ -3007,6 +3170,9 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
 
 vect_finalize_reduction:
 
+  if (double_reduc)
+    loop = loop->inner;
+
   /* 2.5 Adjust the final result by the initial value of the reduction
         variable. (When such adjustment is not needed, then
         'adjustment_def' is zero).  For example, if code is PLUS we create:
@@ -3016,11 +3182,6 @@ vect_finalize_reduction:
     {
       if (nested_in_vect_loop)
        {
-          /* For MINUS_EXPR we create new_temp = loop_exit_def + adjustment_def
-             since the initial value is [0,0,...,0].  */
-          if (code == MINUS_EXPR)
-            code = PLUS_EXPR;
-
          gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
          expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
          new_dest = vect_create_destination_var (scalar_dest, vectype);
@@ -3055,6 +3216,7 @@ vect_finalize_reduction:
          VEC_quick_push (gimple, phis, exit_phi);
        }
     }
+
   /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
   gcc_assert (!VEC_empty (gimple, phis));
 
@@ -3063,12 +3225,13 @@ vect_finalize_reduction:
       if (nested_in_vect_loop)
        {
          stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
+          gimple vect_phi;
 
          /* FORNOW. Currently not supporting the case that an inner-loop
             reduction is not used in the outer-loop (but only outside the
-            outer-loop).  */
-         gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) 
-                     && !STMT_VINFO_LIVE_P (stmt_vinfo));
+            outer-loop), unless it is a double reduction.  */
+         gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo) 
+                      && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
 
          epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
          STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
@@ -3078,7 +3241,88 @@ vect_finalize_reduction:
          if (adjustment_def)
            STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
                STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
-         continue;
+
+          if (!double_reduc 
+              || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
+            continue;
+
+          /* Handle double reduction: 
+
+             stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
+             stmt2:   s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
+             stmt3:   s4 = use (s3)     - (regular) reduction stmt (inner loop)
+             stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
+
+             At this point the regular reduction (stmt2 and stmt3) is already
+             vectorized, as is the exit phi node, stmt4.
+             Here we vectorize the double reduction phi node, stmt1, and
+             update all relevant statements.  */
+
+          /* Go through all the uses of s2 to find the double reduction phi
+             node, i.e., stmt1 above.  */
+          orig_name = PHI_RESULT (exit_phi);
+          FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
+            {
+              stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
+              stmt_vec_info new_phi_vinfo;
+              tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
+              basic_block bb = gimple_bb (use_stmt);
+              gimple use;
+
+              /* Check that USE_STMT is really double reduction phi node.  */
+              if (gimple_code (use_stmt) != GIMPLE_PHI
+                  || gimple_phi_num_args (use_stmt) != 2
+                  || !use_stmt_vinfo
+                  || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) 
+                      != vect_double_reduction_def
+                  || bb->loop_father != outer_loop)
+                continue;
+
+              /* Create vector phi node for double reduction: 
+                 vs1 = phi <vs0, vs2> 
+                 vs1 was created previously in this function by a call to
+                 vect_get_vec_def_for_operand and is stored in vec_initial_def;
+                 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
+                 vs0 is created here.  */
+
+              /* Create vector phi node.  */
+              vect_phi = create_phi_node (vec_initial_def, bb);
+              new_phi_vinfo = new_stmt_vec_info (vect_phi, 
+                                    loop_vec_info_for_loop (outer_loop), NULL);
+              set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
+
+              /* Create vs0 - initial def of the double reduction phi.  */              
+              preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, 
+                                             loop_preheader_edge (outer_loop)); 
+              init_def = get_initial_def_for_reduction (stmt, preheader_arg,
+                                                        NULL);
+              vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
+                                                NULL);
+               
+              /* Update phi node arguments with vs0 and vs2.  */
+              add_phi_arg (vect_phi, vect_phi_init, 
+                           loop_preheader_edge (outer_loop));
+              add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt), 
+                           loop_latch_edge (outer_loop));
+              if (vect_print_dump_info (REPORT_DETAILS))
+                {
+                  fprintf (vect_dump, "created double reduction phi node: ");
+                  print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
+                }
+
+              vect_phi_res = PHI_RESULT (vect_phi);
+
+              /* Replace the use, i.e., set the correct vs1 in the regular
+                 reduction phi node.  FORNOW, NCOPIES is always 1, so the loop
+                 is redundant.  */
+              use = reduction_phi;
+              for (j = 0; j < ncopies; j++)
+                {
+                  edge pr_edge = loop_preheader_edge (loop);
+                  SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); 
+                  use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
+                }
+            }
        }
 
       /* Replace the uses:  */
@@ -3087,6 +3331,7 @@ vect_finalize_reduction:
        FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
          SET_USE (use_p, new_temp);
     }
+
   VEC_free (gimple, heap, phis);
 } 
 
@@ -3171,6 +3416,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   gimple reduc_def_stmt = NULL;
   /* The default is that the reduction variable is the last in statement.  */
   int reduc_index = 2;
+  bool double_reduc = false, dummy;
+  basic_block def_bb;
+  struct loop * def_stmt_loop;
+  tree def_arg;
 
   if (nested_in_vect_loop_p (loop, stmt))
     {
@@ -3185,7 +3434,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
     return false;
 
   /* 1. Is vectorizable reduction?  */
-
   /* Not supportable if the reduction variable is used in the loop.  */
   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
     return false;
@@ -3300,10 +3548,11 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   if (orig_stmt) 
     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, 
                                                        reduc_def_stmt, 
-                                                       !nested_cycle));
+                                                       !nested_cycle, 
+                                                       &dummy));
   else
     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt, 
-                                                  !nested_cycle));
+                                                  !nested_cycle, &dummy));
   
   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
     return false;
@@ -3400,25 +3649,43 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
       orig_code = code;
     }
 
-  if (nested_cycle)
-    epilog_reduc_code = orig_code;
-  else
-    if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
-      return false;
+  if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
+    return false;
 
-  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
+  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, 
+                                     optab_default);
   if (!reduc_optab)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "no optab for reduction.");
       epilog_reduc_code = ERROR_MARK;
     }
-  if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
+
+  if (reduc_optab
+      && optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "reduc op not supported by target.");
       epilog_reduc_code = ERROR_MARK;
     }
+
+  def_bb = gimple_bb (reduc_def_stmt);
+  def_stmt_loop = def_bb->loop_father;
+  def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
+                                   loop_preheader_edge (def_stmt_loop));
+  if (TREE_CODE (def_arg) == SSA_NAME
+      && vinfo_for_stmt (SSA_NAME_DEF_STMT (def_arg))
+      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SSA_NAME_DEF_STMT (def_arg)))
+          == vect_double_reduction_def)
+    double_reduc = true;
+
+  if (double_reduc && ncopies > 1)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "multiple types in double reduction");
+
+      return false;
+    }
  
   if (!vec_stmt) /* transformation not required.  */
     {
@@ -3560,8 +3827,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
      epilog reduction code.  */
   if (!single_defuse_cycle)
     new_temp = gimple_assign_lhs (*vec_stmt);
+
   vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
-                                   epilog_reduc_code, first_phi, reduc_index);
+                                   epilog_reduc_code, first_phi, reduc_index,
+                                    double_reduc);
   return true;
 }
 
index 1c9415b70315902f36712effb5c3023d41cac5a3..891ee1860f09c2f8309faa72d818052da3d2ab9f 100644 (file)
@@ -331,7 +331,7 @@ process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
                ...
        inner-loop:
                d = def_stmt
-       outer-loop-tail-bb:
+       outer-loop-tail-bb (or outer-loop-exit-bb in double reduction):
                stmt # use (d)          */
   else if (flow_loop_nested_p (bb->loop_father, def_bb->loop_father))
     {
@@ -341,7 +341,8 @@ process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
       switch (relevant)
         {
         case vect_unused_in_scope:
-          relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) ?
+          relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def 
+            || STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_double_reduction_def) ?
                       vect_used_in_outer_by_reduction : vect_unused_in_scope;
           break;
 
@@ -393,7 +394,8 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
   basic_block bb;
   gimple phi;
   bool live_p;
-  enum vect_relevant relevant;
+  enum vect_relevant relevant, tmp_relevant;
+  enum vect_def_type def_type;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_mark_stmts_to_be_vectorized ===");
@@ -465,49 +467,64 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
         identify stmts that are used solely by a reduction, and therefore the 
         order of the results that they produce does not have to be kept.  */
 
-      if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def)
+      def_type = STMT_VINFO_DEF_TYPE (stmt_vinfo);
+      tmp_relevant = relevant;
+      switch (def_type)
         {
-         enum vect_relevant tmp_relevant = relevant;
-         switch (tmp_relevant)
-           {
-           case vect_unused_in_scope:
-             gcc_assert (gimple_code (stmt) != GIMPLE_PHI);
-             relevant = vect_used_by_reduction;
-             break;
+          case vect_reduction_def:
+           switch (tmp_relevant)
+             {
+               case vect_unused_in_scope:
+                 relevant = vect_used_by_reduction;
+                 break;
 
-           case vect_used_by_reduction:
-             if (gimple_code (stmt) == GIMPLE_PHI)
-               break;
-             /* fall through */
+               case vect_used_by_reduction:
+                 if (gimple_code (stmt) == GIMPLE_PHI)
+                    break;
+                 /* fall through */
 
-           default:
-             if (vect_print_dump_info (REPORT_DETAILS))
-               fprintf (vect_dump, "unsupported use of reduction.");
-             VEC_free (gimple, heap, worklist);
-             return false;
-           }
+               default:
+                 if (vect_print_dump_info (REPORT_DETAILS))
+                   fprintf (vect_dump, "unsupported use of reduction.");
 
-         live_p = false;       
-       }
-      else if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_nested_cycle)
-        {
-          enum vect_relevant tmp_relevant = relevant;
-          switch (tmp_relevant)
-            {
-              case vect_unused_in_scope:
-              case vect_used_in_outer_by_reduction:
-              case vect_used_in_outer:
-                break;
+                 VEC_free (gimple, heap, worklist);
+                 return false;
+             }
 
-              default:
+           live_p = false;     
+           break;
+          case vect_nested_cycle:
+            if (tmp_relevant != vect_unused_in_scope
+                && tmp_relevant != vect_used_in_outer_by_reduction
+                && tmp_relevant != vect_used_in_outer)
+              {
                 if (vect_print_dump_info (REPORT_DETAILS))
                   fprintf (vect_dump, "unsupported use of nested cycle.");
 
                 VEC_free (gimple, heap, worklist);
                 return false;
-            }
+              }
+
+            live_p = false; 
+            break; 
+      
+          case vect_double_reduction_def:
+            if (tmp_relevant != vect_unused_in_scope
+                && tmp_relevant != vect_used_by_reduction)
+              {
+                if (vect_print_dump_info (REPORT_DETAILS))
+                  fprintf (vect_dump, "unsupported use of double reduction.");
+
+                VEC_free (gimple, heap, worklist);
+                return false;
+              }
+
+            live_p = false;
+            break; 
 
-          live_p = false; 
+          default:
+            break;
         }
  
       FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, iter, SSA_OP_USE)
@@ -974,6 +991,7 @@ vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
 
     /* Case 4: operand is defined by a loop header phi - reduction  */
     case vect_reduction_def:
+    case vect_double_reduction_def:
     case vect_nested_cycle:
       {
        struct loop *loop;
index 05f5e4783f7406676b1f32f4be524bb4d7bfb22d..c7dab10c13fcb257b944e4a18fa06f8d908999f3 100644 (file)
@@ -61,6 +61,7 @@ enum vect_def_type {
   vect_internal_def,
   vect_induction_def,
   vect_reduction_def,
+  vect_double_reduction_def,
   vect_nested_cycle,
   vect_unknown_def_type
 };
@@ -822,7 +823,7 @@ extern tree vect_create_addr_base_for_vector_ref (gimple, gimple_seq *,
 /* In tree-vect-loop.c.  */
 /* FORNOW: Used in tree-parloops.c.  */
 extern void destroy_loop_vec_info (loop_vec_info, bool);
-extern gimple vect_is_simple_reduction (loop_vec_info, gimple, bool);
+extern gimple vect_is_simple_reduction (loop_vec_info, gimple, bool, bool *);
 /* Drive for loop analysis stage.  */
 extern loop_vec_info vect_analyze_loop (struct loop *);
 /* Drive for loop transformation stage.  */