From 06066f92aaea9e9b0fff53a693b300bb9aa1121a Mon Sep 17 00:00:00 2001
From: Ira Rosen <irar@il.ibm.com>
Date: Sun, 12 Jul 2009 07:09:07 +0000
Subject: [PATCH] tree-parloops.c (loop_parallel_p): Call
 vect_is_simple_reduction with additional argument.

	* tree-parloops.c (loop_parallel_p): Call vect_is_simple_reduction
	with additional argument.
	* tree-vectorizer.h (enum vect_def_type): Add
	vect_double_reduction_def.
	(vect_is_simple_reduction): Add argument.
	* tree-vect-loop.c (vect_determine_vectorization_factor): Fix
	indentation.
	(vect_analyze_scalar_cycles_1): Detect double reduction. Call
	vect_is_simple_reduction with additional argument.
	(vect_analyze_loop_operations): Handle exit phi nodes in case of
	double reduction.
	(reduction_code_for_scalar_code): Handle additional codes by
	returning ERROR_MARK for them. Fix comment and indentation.
	(vect_is_simple_reduction): Fix comment, add argument to specify
	double reduction. Detect double reduction.
	(get_initial_def_for_induction): Fix indentation.
	(get_initial_def_for_reduction): Fix comment and indentation.
	Handle double reduction. Create initial definitions that do not
	require adjustment if ADJUSTMENT_DEF is NULL. Handle additional cases.
	(vect_create_epilog_for_reduction): Fix comment, add argument to
	handle double reduction. Use PLUS_EXPR in case of MINUS_EXPR in
	epilogue result extraction. Create double reduction phi node and
	replace relevant uses.
	(vectorizable_reduction): Call vect_is_simple_reduction with
	additional argument. Fix indentation. Update epilogue code treatment
	according to the changes in reduction_code_for_scalar_code. Check
	for double reduction. Call vect_create_epilog_for_reduction with
	additional argument.
	* tree-vect-stmts.c (process_use): Handle double reduction, update
	documentation.
	(vect_mark_stmts_to_be_vectorized): Handle double reduction.
	(vect_get_vec_def_for_operand): Likewise.

From-SVN: r149526
---
 gcc/ChangeLog                                 |  35 ++
 gcc/testsuite/ChangeLog                       |   8 +
 .../gcc.dg/vect/no-scevccp-outer-2.c          |   4 +-
 .../gcc.dg/vect/vect-double-reduc-1.c         |  56 ++
 .../gcc.dg/vect/vect-double-reduc-2.c         |  56 ++
 .../gcc.dg/vect/vect-double-reduc-3.c         |  67 +++
 .../gcc.dg/vect/vect-double-reduc-4.c         |  56 ++
 .../gcc.dg/vect/vect-double-reduc-5.c         |  58 +++
 .../gcc.dg/vect/vect-double-reduc-6.c         |  50 ++
 .../gcc.dg/vect/vect-double-reduc-7.c         |  65 +++
 gcc/tree-parloops.c                           |   4 +-
 gcc/tree-vect-loop.c                          | 493 ++++++++++++++----
 gcc/tree-vect-stmts.c                         |  88 ++--
 gcc/tree-vectorizer.h                         |   3 +-
 14 files changed, 893 insertions(+), 150 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-double-reduc-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-double-reduc-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-double-reduc-3.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-double-reduc-4.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-double-reduc-6.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-double-reduc-7.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index c8a39500b80..edeb0492b36 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,38 @@
+2009-07-12  Ira Rosen  <irar@il.ibm.com>
+
+	* tree-parloops.c (loop_parallel_p): Call vect_is_simple_reduction
+	with additional argument.
+	* tree-vectorizer.h (enum vect_def_type): Add 
+	vect_double_reduction_def.
+	(vect_is_simple_reduction): Add argument.
+	* tree-vect-loop.c (vect_determine_vectorization_factor): Fix 
+	indentation.
+	(vect_analyze_scalar_cycles_1): Detect double reduction. Call
+	vect_is_simple_reduction with additional argument.
+	(vect_analyze_loop_operations): Handle exit phi nodes in case of
+	double reduction.
+	(reduction_code_for_scalar_code): Handle additional codes by
+	returning ERROR_MARK for them. Fix comment and indentation.
+	(vect_is_simple_reduction): Fix comment, add argument to specify
+	double reduction. Detect double reduction.
+	(get_initial_def_for_induction): Fix indentation.
+	(get_initial_def_for_reduction): Fix comment and indentation.
+	Handle double reduction. Create initial definitions that do not
+	require adjustment if ADJUSTMENT_DEF is NULL. Handle additional cases.
+	(vect_create_epilog_for_reduction): Fix comment, add argument to
+	handle double reduction. Use PLUS_EXPR in case of MINUS_EXPR in
+	epilogue result extraction. Create double reduction phi node and
+	replace relevant uses.
+	(vectorizable_reduction): Call vect_is_simple_reduction with
+	additional argument. Fix indentation. Update epilogue code treatment
+	according to the changes in reduction_code_for_scalar_code. Check 
+	for double reduction. Call vect_create_epilog_for_reduction with
+	additional argument.
+	* tree-vect-stmts.c (process_use): Handle double reduction, update
+	documentation.
+	(vect_mark_stmts_to_be_vectorized): Handle double reduction.
+	(vect_get_vec_def_for_operand): Likewise.
+
 2009-07-12  Danny Smith  <dansmister@gmail.com>
 
 	* config/i386/winnt.c (i386_pe_determine_dllexport_p): Don't
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 06d7675748b..7df599ed587 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,11 @@
+2009-07-12  Ira Rosen  <irar@il.ibm.com>
+
+	* gcc.dg/vect/no-scevccp-outer-2.c: Expect to vectorize.
+	* gcc.dg/vect/vect-double-reduc-1.c, gcc.dg/vect/vect-double-reduc-2.c,
+	gcc.dg/vect/vect-double-reduc-3.c, gcc.dg/vect/vect-double-reduc-4.c,
+	gcc.dg/vect/vect-double-reduc-5.c, gcc.dg/vect/vect-double-reduc-6.c,
+	gcc.dg/vect/vect-double-reduc-7.c: New tests.
+
 2009-07-12  Hans-Peter Nilsson  <hp@axis.com>
 
 	* gfortran.dg/f2003_io_4.f03, gfortran.dg/read_size_noadvance.f90,
diff --git a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-2.c b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-2.c
index a9ac09c4a2b..13b37883c2e 100644
--- a/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-2.c
+++ b/gcc/testsuite/gcc.dg/vect/no-scevccp-outer-2.c
@@ -1,4 +1,6 @@
 /* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+
 #define N 40
 
 int
@@ -14,5 +16,5 @@ foo (){
   return diff;
 }
 
-/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect"  } } */
 /* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-1.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-1.c
new file mode 100644
index 00000000000..e3358428a48
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-1.c
@@ -0,0 +1,56 @@
+/* { dg-require-effective-target vect_int_mult } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int coeff[K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {642816,660736,678656,696576,714496,732416,750336,768256,786176,804096,822016,839936,857856,875776,893696,911616,929536,947456,965376,983296,1001216,1019136,1037056,1054976,1072896,1090816,1108736,1126656,1144576,1162496,1180416,1198336};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int sum = 0, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      sum = 0;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++) 
+          sum += in[i+k][j] * coeff[i][j];
+ 
+      out[k] = sum;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        coeff[i][j] = i+2;
+    }
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-2.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-2.c
new file mode 100644
index 00000000000..be469be02de
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-2.c
@@ -0,0 +1,56 @@
+/* { dg-require-effective-target vect_int_mult } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int coeff[K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {357184,339264,321344,303424,285504,267584,249664,231744,213824,195904,177984,160064,142144,124224,106304,88384,70464,52544,34624,16704,-1216,-19136,-37056,-54976,-72896,-90816,-108736,-126656,-144576,-162496,-180416,-198336};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int res = 0, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      res = 1000000;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++) 
+          res -= in[i+k][j] * coeff[i][j];
+ 
+      out[k] = res;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        coeff[i][j] = i+2;
+    }
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-3.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-3.c
new file mode 100644
index 00000000000..87b5a04099e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-3.c
@@ -0,0 +1,67 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int coeff[K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out_max[K], out_min[K];
+int check_max[K] = {62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93};
+int check_min[K] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
+
+__attribute__ ((noinline)) void 
+foo (int x, int y)
+{
+  int max, min, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      max = x;
+      min = y;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++)
+          {
+            max = max < in[i+k][j] ? in[i+k][j] : max; 
+            min = min > in[i+k][j] ? in[i+k][j] : min; 
+          }
+      out_max[k] = max;
+      out_min[k] = min;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        coeff[i][j] = i+2;
+    }
+
+  foo(0, 0);
+
+  for (k = 0; k < K; k++)
+    if (out_max[k] != check_max[k] || out_min[k] != 0)
+      abort ();
+
+  foo(100, 45);
+
+  for (k = 0; k < K; k++)
+    if (out_min[k] != check_min[k] || out_max[k] != 100)
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail vect_no_int_max } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-4.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-4.c
new file mode 100644
index 00000000000..90e0da70a20
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-4.c
@@ -0,0 +1,56 @@
+/* { dg-require-effective-target vect_int_mult } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int coeff[K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {652816,670736,688656,706576,724496,742416,760336,778256,796176,814096,832016,849936,867856,885776,903696,921616,939536,957456,975376,993296,1011216,1029136,1047056,1064976,1082896,1100816,1118736,1136656,1154576,1172496,1190416,1208336};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int sum = 0, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      sum = 10000;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++) 
+          sum += in[i+k][j] * coeff[i][j];
+ 
+      out[k] = sum;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        coeff[i][j] = i+2;
+    }
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c
new file mode 100644
index 00000000000..f624d86502f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-5.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int_mult } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+signed short in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+signed short coeff[K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {642816,660736,678656,696576,714496,732416,750336,768256,786176,804096,822016,839936,857856,875776,893696,911616,929536,947456,965376,983296,1001216,1019136,1037056,1054976,1072896,1090816,1108736,1126656,1144576,1162496,1180416,1198336};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int sum = 0, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      sum = 0;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++) 
+          sum += in[i+k][j] * coeff[i][j];
+ 
+      out[k] = sum;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        coeff[i][j] = i+2;
+    }
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+
+/* Vectorization of loops with multiple types and double reduction is not 
+   supported yet.  */       
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-6.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-6.c
new file mode 100644
index 00000000000..f52b32bfad9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-6.c
@@ -0,0 +1,50 @@
+/* { dg-require-effective-target vect_int_mult } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 4 
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {0,16,256,4096};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int sum;
+  int i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      sum = 1;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++)
+          sum *= in[i+k][j];
+      out[k] = sum;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for (i = 0; i < 2*K; i++)
+    for (j = 0; j < K; j++)
+      in[i][j] = (i+2)/3;
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/testsuite/gcc.dg/vect/vect-double-reduc-7.c b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-7.c
new file mode 100644
index 00000000000..9e7ced7f927
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-double-reduc-7.c
@@ -0,0 +1,65 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define K 32
+
+int in[2*K][K] __attribute__ ((__aligned__(__BIGGEST_ALIGNMENT__)));
+int out[K];
+int check_result[K] = {63,63,191,191,127,127,191,191,127,127,191,191,127,127,191,191,127,127,191,191,127,127,191,191,127,127,191,191,127,127,191,191};
+
+__attribute__ ((noinline)) void 
+foo ()
+{
+  int res_or, res_and, res_xor, i, j, k;
+
+  for (k = 0; k < K; k++)
+    {
+      res_or = 0;
+      for (j = 0; j < K; j++) 
+        for (i = 0; i < K; i++) 
+          res_or = res_or | in[i+k][j];
+ 
+      res_and = 1;
+      for (j = 0; j < K; j++)
+        for (i = 0; i < K; i++)
+          res_and = res_and & in[i+k][j];
+
+      res_xor = 0;
+      for (j = 0; j < K; j++)
+        for (i = 0; i < K; i++)
+          res_xor = res_xor ^ in[i+k][j];
+
+      out[k] = res_or + res_and + res_xor;
+    }
+}
+
+int main ()
+{
+  int i, j, k;
+
+  check_vect ();
+
+  for  (j = 0; j < K; j++)
+    {
+      for (i = 0; i < 2*K; i++)
+        in[i][j] = i+j;
+
+      for (i = 0; i < K; i++)
+        out[i] = i+j;
+    }
+
+  foo();
+
+  for (k = 0; k < K; k++)
+    if (out[k] != check_result[k])
+      abort ();
+
+  return 0;
+}
+        
+/* { dg-final { scan-tree-dump-times "OUTER LOOP VECTORIZED" 3 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+      
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index 5f11fc77a1b..28c96a26f59 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -284,13 +284,15 @@ loop_parallel_p (struct loop *loop, htab_t reduction_list,
     {
       gimple phi = gsi_stmt (gsi);
       gimple reduc_stmt = NULL;
+      bool dummy;
 
       /* ??? TODO: Change this into a generic function that 
          recognizes reductions.  */
       if (!is_gimple_reg (PHI_RESULT (phi)))
 	continue;
       if (simple_loop_info)
-	reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi, true);
+	reduc_stmt = vect_is_simple_reduction (simple_loop_info, phi, true, 
+                                               &dummy);
 
       /*  Create a reduction_info struct, initialize it and insert it to 
          the reduction list.  */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index a37e3c00f72..c96fb04a814 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -291,8 +291,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 	    }
 	  else
 	    {
-
-	      gcc_assert (! STMT_VINFO_DATA_REF (stmt_info)
+	      gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
 			  && !is_pattern_stmt_p (stmt_info));
 
 	      scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, 
@@ -410,6 +409,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
   tree dumy;
   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
   gimple_stmt_iterator gsi;
+  bool double_reduc;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
@@ -477,26 +477,39 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 
       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
-      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle);
+      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle, 
+                                             &double_reduc);
       if (reduc_stmt)
         {
-          if (nested_cycle)
+          if (double_reduc)
             {
               if (vect_print_dump_info (REPORT_DETAILS))
-                fprintf (vect_dump, "Detected vectorizable nested cycle.");
+                fprintf (vect_dump, "Detected double reduction.");
 
-              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
+              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
-                                                        vect_nested_cycle;
+                                                    vect_double_reduction_def;
             }
-          else
+          else 
             {
-              if (vect_print_dump_info (REPORT_DETAILS))
-                fprintf (vect_dump, "Detected reduction.");
+              if (nested_cycle)
+                {
+                  if (vect_print_dump_info (REPORT_DETAILS))
+                    fprintf (vect_dump, "Detected vectorizable nested cycle.");
 
-              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
-              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
-                                                        vect_reduction_def;
+                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
+                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
+                                                             vect_nested_cycle;
+                }
+              else
+                {
+                  if (vect_print_dump_info (REPORT_DETAILS))
+                    fprintf (vect_dump, "Detected reduction.");
+
+                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
+                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
+                                                           vect_reduction_def;
+                }
             }
         }
       else
@@ -1111,10 +1124,13 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
               /* inner-loop loop-closed exit phi in outer-loop vectorization
                  (i.e. a phi in the tail of the outer-loop).
                  FORNOW: we currently don't support the case that these phis
-                 are not used in the outerloop, cause this case requires
-                 to actually do something here.  */
-              if (!STMT_VINFO_RELEVANT_P (stmt_info)
-                  || STMT_VINFO_LIVE_P (stmt_info))
+                 are not used in the outerloop (unless it is double reduction,
+                 i.e., this phi is vect_double_reduction_def), because this
+                 case requires us to actually do something here.  */
+              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
+                   || STMT_VINFO_LIVE_P (stmt_info))
+                  && STMT_VINFO_DEF_TYPE (stmt_info) 
+                     != vect_double_reduction_def)
                 {
                   if (vect_print_dump_info (REPORT_DETAILS))
                     fprintf (vect_dump,
@@ -1466,31 +1482,40 @@ vect_analyze_loop (struct loop *loop)
    Output:
    REDUC_CODE - the corresponding tree-code to be used to reduce the
       vector of partial results into a single scalar result (which
-      will also reside in a vector).
+      will also reside in a vector) or ERROR_MARK if the operation is
+      a supported reduction operation, but does not have such tree-code.
 
-   Return TRUE if a corresponding REDUC_CODE was found, FALSE otherwise.  */
+   Return FALSE if CODE currently cannot be vectorized as reduction.  */
 
 static bool
 reduction_code_for_scalar_code (enum tree_code code,
                                 enum tree_code *reduc_code)
 {
   switch (code)
-  {
-  case MAX_EXPR:
-    *reduc_code = REDUC_MAX_EXPR;
-    return true;
-
-  case MIN_EXPR:
-    *reduc_code = REDUC_MIN_EXPR;
-    return true;
-
-  case PLUS_EXPR:
-    *reduc_code = REDUC_PLUS_EXPR;
-    return true;
-
-  default:
-    return false;
-  }
+    {
+      case MAX_EXPR:
+        *reduc_code = REDUC_MAX_EXPR;
+        return true;
+
+      case MIN_EXPR:
+        *reduc_code = REDUC_MIN_EXPR;
+        return true;
+
+      case PLUS_EXPR:
+        *reduc_code = REDUC_PLUS_EXPR;
+        return true;
+
+      case MULT_EXPR:
+      case MINUS_EXPR:
+      case BIT_IOR_EXPR:
+      case BIT_XOR_EXPR:
+      case BIT_AND_EXPR:
+        *reduc_code = ERROR_MARK;
+        return true;
+
+      default:
+       return false;
+    }
 }
 
 
@@ -1507,7 +1532,7 @@ report_vect_op (gimple stmt, const char *msg)
 
 /* Function vect_is_simple_reduction
 
-   Detect a cross-iteration def-use cycle that represents a simple
+   (1) Detect a cross-iteration def-use cycle that represents a simple
    reduction computation. We look for the following pattern:
 
    loop_header:
@@ -1524,12 +1549,20 @@ report_vect_op (gimple stmt, const char *msg)
    Condition 1 is tested here.
    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.  
 
-   Also detect a cross-iteration def-use cycle in nested loops, i.e., nested
-   cycles, if CHECK_REDUCTION is false.  */
+   (2) Detect a cross-iteration def-use cycle in nested loops, i.e., 
+   nested cycles, if CHECK_REDUCTION is false.  
+
+   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
+   reductions:
+
+     a1 = phi < a0, a2 >
+     inner loop (def of a3)
+     a2 = phi < a3 >
+   In this case *DOUBLE_REDUC is set to true; otherwise it is set to false.  */
 
 gimple
 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi, 
-                          bool check_reduction)
+                          bool check_reduction, bool *double_reduc)
 {
   struct loop *loop = (gimple_bb (phi))->loop_father;
   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
@@ -1543,6 +1576,9 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
   tree name;
   imm_use_iterator imm_iter;
   use_operand_p use_p;
+  bool phi_def;
+
+  *double_reduc = false;
 
   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
      otherwise, we assume outer loop vectorization.  */
@@ -1584,14 +1620,24 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
       return NULL;
     }
 
-  if (!is_gimple_assign (def_stmt))
+  if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
       return NULL;
     }
 
-  name = gimple_assign_lhs (def_stmt);
+  if (is_gimple_assign (def_stmt))
+    {
+      name = gimple_assign_lhs (def_stmt);
+      phi_def = false;
+    }
+  else
+    {
+      name = PHI_RESULT (def_stmt);
+      phi_def = true;
+    }
+
   nloop_uses = 0;
   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
     {
@@ -1608,6 +1654,37 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
 	}
     }
 
+  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
+     defined in the inner loop.  */
+  if (phi_def)
+    {
+      op1 = PHI_ARG_DEF (def_stmt, 0);
+
+      if (gimple_phi_num_args (def_stmt) != 1
+          || TREE_CODE (op1) != SSA_NAME)
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            fprintf (vect_dump, "unsupported phi node definition.");
+
+          return NULL;
+        }
+
+      def1 = SSA_NAME_DEF_STMT (op1); 
+      if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 
+          && loop->inner
+          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
+          && is_gimple_assign (def1))
+        {
+          if (vect_print_dump_info (REPORT_DETAILS))
+            report_vect_op (def_stmt, "detected double reduction: ");
+ 
+          *double_reduc = true;
+          return def_stmt;
+        }
+
+      return NULL;
+    }
+
   code = gimple_assign_rhs_code (def_stmt);
 
   if (check_reduction 
@@ -1697,7 +1774,6 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
       return NULL;
     }
 
-
   /* Check that one def is the reduction def, defined by PHI,
      the other def is either defined in the loop ("vect_internal_def"),
      or it's an induction (defined by a loop-header phi-node).  */
@@ -2296,7 +2372,7 @@ get_initial_def_for_induction (gimple iv_phi)
   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
   gcc_assert (access_fn);
   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
-                                  &init_expr, &step_expr);
+                                    &init_expr, &step_expr);
   gcc_assert (ok);
   pe = loop_preheader_edge (iv_loop);
 
@@ -2306,7 +2382,8 @@ get_initial_def_for_induction (gimple iv_phi)
       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
 	 been created during vectorization of previous stmts; We obtain it from
 	 the STMT_VINFO_VEC_STMT of the defining stmt. */
-      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop));
+      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, 
+                                           loop_preheader_edge (iv_loop));
       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
     }
   else
@@ -2507,18 +2584,16 @@ get_initial_def_for_induction (gimple iv_phi)
         vector of partial results.
 
    Option1 (adjust in epilog): Initialize the vector as follows:
-     add:         [0,0,...,0,0]
-     mult:        [1,1,...,1,1]
-     min/max:     [init_val,init_val,..,init_val,init_val]
-     bit and/or:  [init_val,init_val,..,init_val,init_val]
+     add/bit or/xor: [0,0,...,0,0]
+     mult/bit and:   [1,1,...,1,1]
+     min/max:        [init_val,init_val,..,init_val,init_val]
    and when necessary (e.g. add/mult case) let the caller know
    that it needs to adjust the result by init_val.
 
    Option2: Initialize the vector as follows:
-     add:         [0,0,...,0,init_val]
-     mult:        [1,1,...,1,init_val]
-     min/max:     [init_val,init_val,...,init_val]
-     bit and/or:  [init_val,init_val,...,init_val]
+     add/bit or/xor: [init_val,0,0,...,0]
+     mult/bit and:   [init_val,1,1,...,1]
+     min/max:        [init_val,init_val,...,init_val]
    and no adjustments are needed.
 
    For example, for the following code:
@@ -2533,11 +2608,14 @@ get_initial_def_for_induction (gimple iv_phi)
    the result at the end by 'init_val'.
 
    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
-   initialization vector is simpler (same element in all entries).
+   initialization vector is simpler (same element in all entries), if
+   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
+   
    A cost model should help decide between these two schemes.  */
 
 tree
-get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
+get_initial_def_for_reduction (gimple stmt, tree init_val, 
+                               tree *adjustment_def)
 {
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
@@ -2551,47 +2629,118 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
   tree t = NULL_TREE;
   int i;
   bool nested_in_vect_loop = false; 
+  tree init_value;
+  REAL_VALUE_TYPE real_init_val = dconst0;
+  int int_init_val = 0;
 
   gcc_assert (vectype);
   nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
 	      || SCALAR_FLOAT_TYPE_P (scalar_type));
+
   if (nested_in_vect_loop_p (loop, stmt))
     nested_in_vect_loop = true;
   else
     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
 
-  switch (code)
-  {
-  case WIDEN_SUM_EXPR:
-  case DOT_PROD_EXPR:
-  case PLUS_EXPR:
-  case MINUS_EXPR:
-    if (nested_in_vect_loop)
-      *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
-    else
-      *adjustment_def = init_val;
-    /* Create a vector of zeros for init_def.  */
-    if (SCALAR_FLOAT_TYPE_P (scalar_type))
-      def_for_init = build_real (scalar_type, dconst0);
-    else
-      def_for_init = build_int_cst (scalar_type, 0);
-      
-    for (i = nunits - 1; i >= 0; --i)
-      t = tree_cons (NULL_TREE, def_for_init, t);
-    init_def = build_vector (vectype, t);
-    break;
+  /* In case of a double reduction we only create a vector variable to be put
+     in the reduction phi node.  The actual statement creation is done in
+     vect_create_epilog_for_reduction.  */
+  if (TREE_CODE (init_val) == SSA_NAME
+      && vinfo_for_stmt (SSA_NAME_DEF_STMT (init_val)) 
+      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SSA_NAME_DEF_STMT (init_val))) 
+          == vect_double_reduction_def)
+    {
+      *adjustment_def = NULL;
+      return vect_create_destination_var (init_val, vectype);
+    }
 
-  case MIN_EXPR:
-  case MAX_EXPR:
-    *adjustment_def = NULL_TREE;
-    init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
-    break;
+  if (TREE_CONSTANT (init_val))
+    {
+      if (SCALAR_FLOAT_TYPE_P (scalar_type))
+        init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
+      else
+        init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
+    }
+  else
+    init_value = init_val;
 
-  default:
-    gcc_unreachable ();
-  }
+  switch (code)
+    {
+      case WIDEN_SUM_EXPR:
+      case DOT_PROD_EXPR:
+      case PLUS_EXPR:
+      case MINUS_EXPR:
+      case BIT_IOR_EXPR:
+      case BIT_XOR_EXPR:
+      case MULT_EXPR:
+      case BIT_AND_EXPR:
+        /* ADJUSTMENT_DEF is NULL when called from
+           vect_create_epilog_for_reduction to vectorize double reduction.  */
+        if (adjustment_def)
+          {
+            if (nested_in_vect_loop)
+              *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt, 
+                                                              NULL);
+            else
+              *adjustment_def = init_val;
+          }
+
+        if (code == MULT_EXPR || code == BIT_AND_EXPR)
+          {
+            real_init_val = dconst1;
+            int_init_val = 1;
+          }
+
+        if (SCALAR_FLOAT_TYPE_P (scalar_type))
+          def_for_init = build_real (scalar_type, real_init_val);
+        else
+          def_for_init = build_int_cst (scalar_type, int_init_val);
+
+        /* Create a vector of '0' or '1' except the first element.  */ 
+        for (i = nunits - 2; i >= 0; --i)
+          t = tree_cons (NULL_TREE, def_for_init, t);
+
+        /* Option1: the first element is '0' or '1' as well.  */
+        if (adjustment_def)
+          {
+            t = tree_cons (NULL_TREE, def_for_init, t);
+            init_def = build_vector (vectype, t);
+            break;
+          }
+
+        /* Option2: the first element is INIT_VAL.  */
+        t = tree_cons (NULL_TREE, init_value, t);
+        if (TREE_CONSTANT (init_val))
+          init_def = build_vector (vectype, t);
+        else
+          init_def = build_constructor_from_list (vectype, t);
+
+        break;
+
+      case MIN_EXPR:
+      case MAX_EXPR:
+        if (adjustment_def)
+          {
+            *adjustment_def = NULL_TREE;
+            init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
+            break;
+          }
+
+        for (i = nunits - 1; i >= 0; --i)
+          t = tree_cons (NULL_TREE, init_value, t);
+
+        if (TREE_CONSTANT (init_val))
+          init_def = build_vector (vectype, t);
+        else
+          init_def = build_constructor_from_list (vectype, t);
+
+        break;
+
+      default:
+        gcc_unreachable ();
+    }
 
   return init_def;
 }
@@ -2613,6 +2762,7 @@ get_initial_def_for_reduction (gimple stmt, tree init_val, tree *adjustment_def)
    REDUCTION_PHI is the phi-node that carries the reduction computation.
    REDUC_INDEX is the index of the operand in the right hand side of the 
      statement that is defined by REDUCTION_PHI.
+   DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
 
    This function:
    1. Creates the reduction def-use cycle: sets the arguments for 
@@ -2657,14 +2807,15 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
 				  int ncopies,
 				  enum tree_code reduc_code,
 				  gimple reduction_phi,
-                                  int reduc_index)
+                                  int reduc_index, 
+                                  bool double_reduc)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   stmt_vec_info prev_phi_info;
   tree vectype;
   enum machine_mode mode;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
   basic_block exit_bb;
   tree scalar_dest;
   tree scalar_type;
@@ -2694,6 +2845,7 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
   
   if (nested_in_vect_loop_p (loop, stmt))
     {
+      outer_loop = loop;
       loop = loop->inner;
       nested_in_vect_loop = true;
     }
@@ -2726,7 +2878,7 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
      the scalar def before the loop, that defines the initial value
      of the reduction variable.  */
   vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
-						  &adjustment_def);
+					          &adjustment_def);
 
   phi = reduction_phi;
   def = vect_def;
@@ -2744,8 +2896,8 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
 	{
 	  fprintf (vect_dump, "transform reduction: created def-use cycle: ");
 	  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
-	  fprintf (vect_dump, "\n");
-	  print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
+          fprintf (vect_dump, "\n");
+          print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
 	}
 
       phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
@@ -2831,15 +2983,25 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
   bitsize = TYPE_SIZE (scalar_type);
   bytesize = TYPE_SIZE_UNIT (scalar_type);
 
+  /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
+     partial results are added and not subtracted.  */
+  if (code == MINUS_EXPR)
+    code = PLUS_EXPR;
 
   /* In case this is a reduction in an inner-loop while vectorizing an outer
      loop - we don't need to extract a single scalar result at the end of the
-     inner-loop.  The final vector of partial results will be used in the
-     vectorized outer-loop, or reduced to a scalar result at the end of the
-     outer-loop.  */
-  if (nested_in_vect_loop)
+     inner-loop (unless it is a double reduction, i.e., the reduction is used
+     outside the outer-loop).  The final vector of partial results will be used
+     in the vectorized outer-loop, or reduced to a scalar result at the end of
+     the outer-loop.  */
+  if (nested_in_vect_loop && !double_reduc)
     goto vect_finalize_reduction;
 
+  /* The epilogue is created for the outer-loop, i.e., for the loop being
+     vectorized.  */
+  if (double_reduc)
+    loop = outer_loop;
+
   /* FORNOW */
   gcc_assert (ncopies == 1);
 
@@ -2914,6 +3076,7 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
 	       bit_offset /= 2)
 	    {
 	      tree bitpos = size_int (bit_offset);
+              
 	      epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
 							  new_temp, bitpos);
 	      new_name = make_ssa_name (vec_dest, epilog_stmt);
@@ -2987,7 +3150,7 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
     {
       tree rhs;
 
-      gcc_assert (!nested_in_vect_loop);
+      gcc_assert (!nested_in_vect_loop || double_reduc);
       if (vect_print_dump_info (REPORT_DETAILS))
 	fprintf (vect_dump, "extract scalar result");
 
@@ -3007,6 +3170,9 @@ vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
 
 vect_finalize_reduction:
 
+  if (double_reduc)
+    loop = loop->inner;
+
   /* 2.5 Adjust the final result by the initial value of the reduction
 	 variable. (When such adjustment is not needed, then
 	 'adjustment_def' is zero).  For example, if code is PLUS we create:
@@ -3016,11 +3182,6 @@ vect_finalize_reduction:
     {
       if (nested_in_vect_loop)
 	{
-          /* For MINUS_EXPR we create new_temp = loop_exit_def + adjustment_def
-             since the initial value is [0,0,...,0].  */
-          if (code == MINUS_EXPR)
-            code = PLUS_EXPR;
-
 	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
 	  expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
 	  new_dest = vect_create_destination_var (scalar_dest, vectype);
@@ -3055,6 +3216,7 @@ vect_finalize_reduction:
 	  VEC_quick_push (gimple, phis, exit_phi);
 	}
     }
+
   /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
   gcc_assert (!VEC_empty (gimple, phis));
 
@@ -3063,12 +3225,13 @@ vect_finalize_reduction:
       if (nested_in_vect_loop)
 	{
 	  stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
+          gimple vect_phi;
 
 	  /* FORNOW. Currently not supporting the case that an inner-loop
 	     reduction is not used in the outer-loop (but only outside the
-	     outer-loop).  */
-	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) 
-		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
+	     outer-loop), unless it is double reduction.  */
+	  gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo) 
+                      && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
 
 	  epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
 	  STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
@@ -3078,7 +3241,88 @@ vect_finalize_reduction:
 	  if (adjustment_def)
 	    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
 		STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
-	  continue;
+
+          if (!double_reduc 
+              || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
+            continue;
+
+          /* Handle double reduction: 
+
+             stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
+             stmt2:   s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
+             stmt3:   s4 = use (s3)     - (regular) reduction stmt (inner loop)
+             stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
+
+             At this point the regular reduction (stmt2 and stmt3) is already
+             vectorized, as well as the exit phi node, stmt4.
+             Here we vectorize the phi node of double reduction, stmt1, and
+             update all relevant statements.  */
+
+          /* Go through all the uses of s2 to find the double reduction phi
+             node, i.e., stmt1 above.  */
+          orig_name = PHI_RESULT (exit_phi);
+          FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
+            {
+              stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
+              stmt_vec_info new_phi_vinfo;
+              tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
+              basic_block bb = gimple_bb (use_stmt);
+              gimple use;
+
+              /* Check that USE_STMT is really a double reduction phi node.  */
+              if (gimple_code (use_stmt) != GIMPLE_PHI
+                  || gimple_phi_num_args (use_stmt) != 2
+                  || !use_stmt_vinfo
+                  || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) 
+                      != vect_double_reduction_def
+                  || bb->loop_father != outer_loop)
+                continue;
+
+              /* Create vector phi node for double reduction: 
+                 vs1 = phi <vs0, vs2> 
+                 vs1 was created previously in this function by a call to
+                 vect_get_vec_def_for_operand and is stored in vec_initial_def;
+                 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
+                 vs0 is created here.  */
+
+              /* Create vector phi node.  */
+              vect_phi = create_phi_node (vec_initial_def, bb);
+              new_phi_vinfo = new_stmt_vec_info (vect_phi, 
+                                    loop_vec_info_for_loop (outer_loop), NULL);
+              set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
+
+              /* Create vs0 - initial def of the double reduction phi.  */              
+              preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, 
+                                             loop_preheader_edge (outer_loop)); 
+              init_def = get_initial_def_for_reduction (stmt, preheader_arg,
+                                                        NULL);
+              vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
+                                                NULL);
+               
+              /* Update phi node arguments with vs0 and vs2.  */
+              add_phi_arg (vect_phi, vect_phi_init, 
+                           loop_preheader_edge (outer_loop));
+              add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt), 
+                           loop_latch_edge (outer_loop));
+              if (vect_print_dump_info (REPORT_DETAILS))
+                {
+                  fprintf (vect_dump, "created double reduction phi node: ");
+                  print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
+                }
+
+              vect_phi_res = PHI_RESULT (vect_phi);
+
+              /* Replace the use, i.e., set the correct vs1 in the regular
+                 reduction phi node. FORNOW, NCOPIES is always 1, so the loop
+                 is redundant.  */                  
+              use = reduction_phi;
+              for (j = 0; j < ncopies; j++)
+                {
+                  edge pr_edge = loop_preheader_edge (loop);
+                  SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); 
+                  use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
+                }
+            }
 	}
 
       /* Replace the uses:  */
@@ -3087,6 +3331,7 @@ vect_finalize_reduction:
 	FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 	  SET_USE (use_p, new_temp);
     }
+
   VEC_free (gimple, heap, phis);
 } 
 
@@ -3171,6 +3416,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   gimple reduc_def_stmt = NULL;
   /* The default is that the reduction variable is the last in statement.  */
   int reduc_index = 2;
+  bool double_reduc = false, dummy;
+  basic_block def_bb;
+  struct loop * def_stmt_loop;
+  tree def_arg;
 
   if (nested_in_vect_loop_p (loop, stmt))
     {
@@ -3185,7 +3434,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
     return false;
 
   /* 1. Is vectorizable reduction?  */
-
   /* Not supportable if the reduction variable is used in the loop.  */
   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
     return false;
@@ -3300,10 +3548,11 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   if (orig_stmt) 
     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, 
                                                        reduc_def_stmt, 
-                                                       !nested_cycle));
+                                                       !nested_cycle, 
+                                                       &dummy));
   else
     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt, 
-                                                  !nested_cycle));
+                                                  !nested_cycle, &dummy));
   
   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
     return false;
@@ -3400,25 +3649,43 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
       orig_code = code;
     }
 
-  if (nested_cycle)
-    epilog_reduc_code = orig_code;
-  else
-    if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
-      return false;
+  if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
+    return false;
 
-  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, optab_default);
+  reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, 
+                                     optab_default);
   if (!reduc_optab)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "no optab for reduction.");
       epilog_reduc_code = ERROR_MARK;
     }
-  if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
+
+  if (reduc_optab
+      && optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
     {
       if (vect_print_dump_info (REPORT_DETAILS))
         fprintf (vect_dump, "reduc op not supported by target.");
       epilog_reduc_code = ERROR_MARK;
     }
+
+  def_bb = gimple_bb (reduc_def_stmt);
+  def_stmt_loop = def_bb->loop_father;
+  def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
+                                   loop_preheader_edge (def_stmt_loop));
+  if (TREE_CODE (def_arg) == SSA_NAME
+      && vinfo_for_stmt (SSA_NAME_DEF_STMT (def_arg))
+      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SSA_NAME_DEF_STMT (def_arg)))
+          == vect_double_reduction_def)
+    double_reduc = true;
+
+  if (double_reduc && ncopies > 1)
+    {
+      if (vect_print_dump_info (REPORT_DETAILS))
+        fprintf (vect_dump, "multiple types in double reduction");
+
+      return false;
+    }
  
   if (!vec_stmt) /* transformation not required.  */
     {
@@ -3560,8 +3827,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
      epilog reduction code.  */
   if (!single_defuse_cycle)
     new_temp = gimple_assign_lhs (*vec_stmt);
+
   vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
-				    epilog_reduc_code, first_phi, reduc_index);
+				    epilog_reduc_code, first_phi, reduc_index,
+                                    double_reduc);
   return true;
 }
 
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 1c9415b7031..891ee1860f0 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -331,7 +331,7 @@ process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
 		...
 	inner-loop:
 		d = def_stmt
-	outer-loop-tail-bb:
+	outer-loop-tail-bb (or outer-loop-exit-bb in double reduction):
 		stmt # use (d)		*/
   else if (flow_loop_nested_p (bb->loop_father, def_bb->loop_father))
     {
@@ -341,7 +341,8 @@ process_use (gimple stmt, tree use, loop_vec_info loop_vinfo, bool live_p,
       switch (relevant)
         {
         case vect_unused_in_scope:
-          relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def) ?
+          relevant = (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def 
+            || STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_double_reduction_def) ?
                       vect_used_in_outer_by_reduction : vect_unused_in_scope;
           break;
 
@@ -393,7 +394,8 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
   basic_block bb;
   gimple phi;
   bool live_p;
-  enum vect_relevant relevant;
+  enum vect_relevant relevant, tmp_relevant;
+  enum vect_def_type def_type;
 
   if (vect_print_dump_info (REPORT_DETAILS))
     fprintf (vect_dump, "=== vect_mark_stmts_to_be_vectorized ===");
@@ -465,49 +467,64 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo)
 	 identify stmts that are used solely by a reduction, and therefore the 
 	 order of the results that they produce does not have to be kept.  */
 
-      if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def)
+      def_type = STMT_VINFO_DEF_TYPE (stmt_vinfo);
+      tmp_relevant = relevant;
+      switch (def_type)
         {
-	  enum vect_relevant tmp_relevant = relevant;
-	  switch (tmp_relevant)
-	    {
-	    case vect_unused_in_scope:
-	      gcc_assert (gimple_code (stmt) != GIMPLE_PHI);
-	      relevant = vect_used_by_reduction;
-	      break;
+          case vect_reduction_def:
+	    switch (tmp_relevant)
+	      {
+	        case vect_unused_in_scope:
+	          relevant = vect_used_by_reduction;
+	          break;
 
-	    case vect_used_by_reduction:
-	      if (gimple_code (stmt) == GIMPLE_PHI)
-		break;
-	      /* fall through */
+	        case vect_used_by_reduction:
+	          if (gimple_code (stmt) == GIMPLE_PHI)
+                    break;
+  	          /* fall through */
 
-	    default:
-	      if (vect_print_dump_info (REPORT_DETAILS))
-	        fprintf (vect_dump, "unsupported use of reduction.");
-	      VEC_free (gimple, heap, worklist);
-	      return false;
-	    }
+	        default:
+	          if (vect_print_dump_info (REPORT_DETAILS))
+	            fprintf (vect_dump, "unsupported use of reduction.");
 
-	  live_p = false;	
-	}
-      else if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_nested_cycle)
-        {
-          enum vect_relevant tmp_relevant = relevant;
-          switch (tmp_relevant)
-            {
-              case vect_unused_in_scope:
-              case vect_used_in_outer_by_reduction:
-              case vect_used_in_outer:
-                break;
+  	          VEC_free (gimple, heap, worklist);
+	          return false;
+	      }
 
-              default:
+	    live_p = false;	
+	    break;
+ 
+          case vect_nested_cycle:
+            if (tmp_relevant != vect_unused_in_scope
+                && tmp_relevant != vect_used_in_outer_by_reduction
+                && tmp_relevant != vect_used_in_outer)
+              {
                 if (vect_print_dump_info (REPORT_DETAILS))
                   fprintf (vect_dump, "unsupported use of nested cycle.");
 
                 VEC_free (gimple, heap, worklist);
                 return false;
-            }
+              }
+
+            live_p = false; 
+            break; 
+      
+          case vect_double_reduction_def:
+            if (tmp_relevant != vect_unused_in_scope
+                && tmp_relevant != vect_used_by_reduction)
+              {
+                if (vect_print_dump_info (REPORT_DETAILS))
+                  fprintf (vect_dump, "unsupported use of double reduction.");
+
+                VEC_free (gimple, heap, worklist);
+                return false;
+              }
+
+            live_p = false;
+            break; 
 
-          live_p = false; 
+          default:
+            break;
         }
  
       FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, iter, SSA_OP_USE)
@@ -974,6 +991,7 @@ vect_get_vec_def_for_operand (tree op, gimple stmt, tree *scalar_def)
 
     /* Case 4: operand is defined by a loop header phi - reduction  */
     case vect_reduction_def:
+    case vect_double_reduction_def:
     case vect_nested_cycle:
       {
 	struct loop *loop;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 05f5e4783f7..c7dab10c13f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -61,6 +61,7 @@ enum vect_def_type {
   vect_internal_def,
   vect_induction_def,
   vect_reduction_def,
+  vect_double_reduction_def,
   vect_nested_cycle,
   vect_unknown_def_type
 };
@@ -822,7 +823,7 @@ extern tree vect_create_addr_base_for_vector_ref (gimple, gimple_seq *,
 /* In tree-vect-loop.c.  */
 /* FORNOW: Used in tree-parloops.c.  */
 extern void destroy_loop_vec_info (loop_vec_info, bool);
-extern gimple vect_is_simple_reduction (loop_vec_info, gimple, bool);
+extern gimple vect_is_simple_reduction (loop_vec_info, gimple, bool, bool *);
 /* Drive for loop analysis stage.  */
 extern loop_vec_info vect_analyze_loop (struct loop *);
 /* Drive for loop transformation stage.  */
-- 
2.30.2