From: Richard Sandiford <richard.sandiford@linaro.org>
Date: Thu, 12 Jul 2018 13:02:00 +0000 (+0000)
Subject: Use conditional internal functions in if-conversion
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=2c58d42c3ed599b4c2976fc173eefd8e016ea216;p=gcc.git

Use conditional internal functions in if-conversion

This patch uses IFN_COND_* to vectorise conditionally-executed,
potentially-trapping arithmetic, such as most floating-point
ops with -ftrapping-math.  E.g.:

    if (cond) { ... x = a + b; ... }

becomes:

    ...
    x = .COND_ADD (cond, a, b, else_value);
    ...

When this transformation is done on its own, the value of x for
!cond isn't important, so else_value is simply the target's
preferred_else_value (i.e. the value it can handle the most
efficiently).

However, the patch also looks for the equivalent of:

    y = cond ? x : c;

in which the "then" value is the result of the conditionally-executed
operation and the "else" value "c" is some value that is available at
the point where x is defined.
In that case we can instead use:

    x = .COND_ADD (cond, a, b, c);

and replace uses of y with uses of x.

The patch also looks for:

    y = !cond ? c : x;

which can be transformed in the same way.  This involved adding a new
utility function inverse_conditions_p; the check it performs was
previously open-coded, in a more limited way, in match.pd.

2018-07-12  Richard Sandiford  <richard.sandiford@linaro.org>

gcc/
	* fold-const.h (inverse_conditions_p): Declare.
	* fold-const.c (inverse_conditions_p): New function.
	* match.pd: Use inverse_conditions_p.  Add folds of view_converts
	that test the inverse condition of a conditional internal function.
	* internal-fn.h (vectorized_internal_fn_supported_p): Declare.
	* internal-fn.c (internal_fn_mask_index): Handle conditional
	internal functions.
	(vectorized_internal_fn_supported_p): New function.
	* tree-if-conv.c: Include internal-fn.h and fold-const.h.
	(any_pred_load_store): Replace with...
	(need_to_predicate): ...this new variable.
	(redundant_ssa_names): New variable.
	(ifcvt_can_use_mask_load_store): Move initial checks to...
	(ifcvt_can_predicate): ...this new function.  Handle tree codes
	for which a conditional internal function exists.
	(if_convertible_gimple_assign_stmt_p): Use ifcvt_can_predicate
	instead of ifcvt_can_use_mask_load_store.  Update after variable
	name change.
	(predicate_load_or_store): New function, split out from
	predicate_mem_writes.
	(check_redundant_cond_expr): New function.
	(value_available_p): Likewise.
	(predicate_rhs_code): Likewise.
	(predicate_mem_writes): Rename to...
	(predicate_statements): ...this.  Use predicate_load_or_store
	and predicate_rhs_code.
	(combine_blocks, tree_if_conversion): Update after above name changes.
	(ifcvt_local_dce): Handle redundant_ssa_names.
	* tree-vect-patterns.c (vect_recog_mask_conversion_pattern): Handle
	general conditional functions.
	* tree-vect-stmts.c (vectorizable_call): Likewise.

gcc/testsuite/
	* gcc.dg/vect/vect-cond-arith-4.c: New test.
	* gcc.dg/vect/vect-cond-arith-5.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_1.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_1_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_2.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_2_run.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_3.c: Likewise.
	* gcc.target/aarch64/sve/cond_arith_3_run.c: Likewise.

From-SVN: r262589
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 20ed355ca12..a77b04ccf6b 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,37 @@
+2018-07-12  Richard Sandiford  <richard.sandiford@linaro.org>
+
+	* fold-const.h (inverse_conditions_p): Declare.
+	* fold-const.c (inverse_conditions_p): New function.
+	* match.pd: Use inverse_conditions_p.  Add folds of view_converts
+	that test the inverse condition of a conditional internal function.
+	* internal-fn.h (vectorized_internal_fn_supported_p): Declare.
+	* internal-fn.c (internal_fn_mask_index): Handle conditional
+	internal functions.
+	(vectorized_internal_fn_supported_p): New function.
+	* tree-if-conv.c: Include internal-fn.h and fold-const.h.
+	(any_pred_load_store): Replace with...
+	(need_to_predicate): ...this new variable.
+	(redundant_ssa_names): New variable.
+	(ifcvt_can_use_mask_load_store): Move initial checks to...
+	(ifcvt_can_predicate): ...this new function.  Handle tree codes
+	for which a conditional internal function exists.
+	(if_convertible_gimple_assign_stmt_p): Use ifcvt_can_predicate
+	instead of ifcvt_can_use_mask_load_store.  Update after variable
+	name change.
+	(predicate_load_or_store): New function, split out from
+	predicate_mem_writes.
+	(check_redundant_cond_expr): New function.
+	(value_available_p): Likewise.
+	(predicate_rhs_code): Likewise.
+	(predicate_mem_writes): Rename to...
+	(predicate_statements): ...this.  Use predicate_load_or_store
+	and predicate_rhs_code.
+	(combine_blocks, tree_if_conversion): Update after above name changes.
+	(ifcvt_local_dce): Handle redundant_ssa_names.
+	* tree-vect-patterns.c (vect_recog_mask_conversion_pattern): Handle
+	general conditional functions.
+	* tree-vect-stmts.c (vectorizable_call): Likewise.
+
 2018-07-12  Richard Sandiford  <richard.sandiford@linaro.org>
 	    Alan Hayward  <alan.hayward@arm.com>
 	    David Sherwood  <david.sherwood@arm.com>
diff --git a/gcc/fold-const.c b/gcc/fold-const.c
index ac65dcfaf1d..1197d75ec75 100644
--- a/gcc/fold-const.c
+++ b/gcc/fold-const.c
@@ -2787,6 +2787,22 @@ compcode_to_comparison (enum comparison_code code)
     }
 }
 
+/* Return true if COND1 tests the opposite of COND2 on the same operands.  */
+
+bool
+inverse_conditions_p (const_tree cond1, const_tree cond2)
+{
+  return (COMPARISON_CLASS_P (cond1)
+	  && COMPARISON_CLASS_P (cond2)
+	  && (invert_tree_comparison
+	      (TREE_CODE (cond1),
+	       HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
+	  && operand_equal_p (TREE_OPERAND (cond1, 0),
+			      TREE_OPERAND (cond2, 0), 0)
+	  && operand_equal_p (TREE_OPERAND (cond1, 1),
+			      TREE_OPERAND (cond2, 1), 0));
+}
+
 /* Return a tree for the comparison which is the combination of
    doing the AND or OR (depending on CODE) of the two operations LCODE
    and RCODE on the identical operands LL_ARG and LR_ARG.  Take into account
diff --git a/gcc/fold-const.h b/gcc/fold-const.h
index 4613a62e1f6..b3225752385 100644
--- a/gcc/fold-const.h
+++ b/gcc/fold-const.h
@@ -127,6 +127,7 @@ extern enum tree_code swap_tree_comparison (enum tree_code);
 
 extern bool ptr_difference_const (tree, tree, poly_int64_pod *);
 extern enum tree_code invert_tree_comparison (enum tree_code, bool);
+extern bool inverse_conditions_p (const_tree, const_tree);
 
 extern bool tree_unary_nonzero_warnv_p (enum tree_code, tree, tree, bool *);
 extern bool tree_binary_nonzero_warnv_p (enum tree_code, tree, tree, tree op1,
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 15755ea06fd..8ede9cac3ef 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -3466,7 +3466,8 @@ internal_fn_mask_index (internal_fn fn)
       return 4;
 
     default:
-      return -1;
+      return (conditional_internal_fn_code (fn) != ERROR_MARK
+	      || get_unconditional_internal_fn (fn) != IFN_LAST ? 0 : -1);
     }
 }
 
@@ -3531,6 +3532,26 @@ expand_internal_call (gcall *stmt)
   expand_internal_call (gimple_call_internal_fn (stmt), stmt);
 }
 
+/* Return true if IFN is a direct internal function that is supported
+   (when optimizing for speed) on the vector form of TYPE: TYPE itself
+   if it is a vector type, otherwise the target's preferred vector version
+   of scalar TYPE.  Return false if that type has no vector mode.  */
+
+bool
+vectorized_internal_fn_supported_p (internal_fn ifn, tree type)
+{
+  scalar_mode smode;
+  if (!VECTOR_TYPE_P (type) && is_a <scalar_mode> (TYPE_MODE (type), &smode))
+    {
+      machine_mode vmode = targetm.vectorize.preferred_simd_mode (smode);
+      if (VECTOR_MODE_P (vmode))
+	type = build_vector_type_for_mode (type, vmode);
+    }
+
+  return (VECTOR_MODE_P (TYPE_MODE (type))
+	  && direct_internal_fn_supported_p (ifn, type, OPTIMIZE_FOR_SPEED));
+}
+
 void
 expand_PHI (internal_fn, gcall *)
 {
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index 2296ca0c539..48db7721aa8 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -212,4 +212,6 @@ extern void expand_internal_call (gcall *);
 extern void expand_internal_call (internal_fn, gcall *);
 extern void expand_PHI (internal_fn, gcall *);
 
+extern bool vectorized_internal_fn_supported_p (internal_fn, tree);
+
 #endif
diff --git a/gcc/match.pd b/gcc/match.pd
index dea6cde49ab..1a8ad446585 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -2951,21 +2951,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
     from if-conversion.  */
  (simplify
   (cnd @0 @1 (cnd @2 @3 @4))
-  (if (COMPARISON_CLASS_P (@0)
-       && COMPARISON_CLASS_P (@2)
-       && invert_tree_comparison
-           (TREE_CODE (@0), HONOR_NANS (TREE_OPERAND (@0, 0))) == TREE_CODE (@2)
-       && operand_equal_p (TREE_OPERAND (@0, 0), TREE_OPERAND (@2, 0), 0)
-       && operand_equal_p (TREE_OPERAND (@0, 1), TREE_OPERAND (@2, 1), 0))
+  (if (inverse_conditions_p (@0, @2))
    (cnd @0 @1 @3)))
  (simplify
   (cnd @0 (cnd @1 @2 @3) @4)
-  (if (COMPARISON_CLASS_P (@0)
-       && COMPARISON_CLASS_P (@1)
-       && invert_tree_comparison
-           (TREE_CODE (@0), HONOR_NANS (TREE_OPERAND (@0, 0))) == TREE_CODE (@1)
-       && operand_equal_p (TREE_OPERAND (@0, 0), TREE_OPERAND (@1, 0), 0)
-       && operand_equal_p (TREE_OPERAND (@0, 1), TREE_OPERAND (@1, 1), 0))
+  (if (inverse_conditions_p (@0, @1))
    (cnd @0 @3 @4)))
 
  /* A ? B : B -> B.  */
@@ -4913,7 +4903,13 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (vec_cond @0 (view_convert? (cond_op @0 @1 @2 @3)) @4)
   (with { tree op_type = TREE_TYPE (@3); }
    (if (element_precision (type) == element_precision (op_type))
-    (view_convert (cond_op @0 @1 @2 (view_convert:op_type @4)))))))
+    (view_convert (cond_op @0 @1 @2 (view_convert:op_type @4))))))
+ (simplify
+  (vec_cond @0 @1 (view_convert? (cond_op @2 @3 @4 @5)))
+  (with { tree op_type = TREE_TYPE (@5); }
+   (if (inverse_conditions_p (@0, @2)
+        && element_precision (type) == element_precision (op_type))
+    (view_convert (cond_op @2 @3 @4 (view_convert:op_type @1)))))))
 
 /* Same for ternary operations.  */
 (for cond_op (COND_TERNARY)
@@ -4921,4 +4917,10 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (vec_cond @0 (view_convert? (cond_op @0 @1 @2 @3 @4)) @5)
   (with { tree op_type = TREE_TYPE (@4); }
    (if (element_precision (type) == element_precision (op_type))
-    (view_convert (cond_op @0 @1 @2 @3 (view_convert:op_type @5)))))))
+    (view_convert (cond_op @0 @1 @2 @3 (view_convert:op_type @5))))))
+ (simplify
+  (vec_cond @0 @1 (view_convert? (cond_op @2 @3 @4 @5 @6)))
+  (with { tree op_type = TREE_TYPE (@6); }
+   (if (inverse_conditions_p (@0, @2)
+        && element_precision (type) == element_precision (op_type))
+    (view_convert (cond_op @2 @3 @4 @5 (view_convert:op_type @1)))))))
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 8291e3d72f0..b101f230e54 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,14 @@
+2018-07-12  Richard Sandiford  <richard.sandiford@linaro.org>
+
+	* gcc.dg/vect/vect-cond-arith-4.c: New test.
+	* gcc.dg/vect/vect-cond-arith-5.c: Likewise.
+	* gcc.target/aarch64/sve/cond_arith_1.c: Likewise.
+	* gcc.target/aarch64/sve/cond_arith_1_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_arith_2.c: Likewise.
+	* gcc.target/aarch64/sve/cond_arith_2_run.c: Likewise.
+	* gcc.target/aarch64/sve/cond_arith_3.c: Likewise.
+	* gcc.target/aarch64/sve/cond_arith_3_run.c: Likewise.
+
 2018-07-12  Richard Sandiford  <richard.sandiford@linaro.org>
 	    Alan Hayward  <alan.hayward@arm.com>
 	    David Sherwood  <david.sherwood@arm.com>
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-arith-4.c b/gcc/testsuite/gcc.dg/vect/vect-cond-arith-4.c
new file mode 100644
index 00000000000..55a174a7ec1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cond-arith-4.c
@@ -0,0 +1,58 @@
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include "tree-vect.h"
+
+#define N (VECTOR_BITS * 11 / 64 + 3)
+
+#define add(A, B) ((A) + (B))
+#define sub(A, B) ((A) - (B))
+#define mul(A, B) ((A) * (B))
+#define div(A, B) ((A) / (B))
+
+#define DEF(OP)							\
+  void __attribute__ ((noipa))					\
+  f_##OP (double *restrict a, double *restrict b, double x)	\
+  {								\
+    for (int i = 0; i < N; ++i)					\
+      a[i] = b[i] < 100 ? OP (b[i], x) : b[i];			\
+  }
+
+#define TEST(OP)					\
+  {							\
+    f_##OP (a, b, 10);					\
+    for (int i = 0; i < N; ++i)				\
+      {							\
+	int bval = (i % 17) * 10;			\
+	int truev = OP (bval, 10);			\
+	if (a[i] != (bval < 100 ? truev : bval))	\
+	__builtin_abort ();				\
+	asm volatile ("" ::: "memory");			\
+      }							\
+  }
+
+#define FOR_EACH_OP(T)				\
+  T (add)					\
+  T (sub)					\
+  T (mul)					\
+  T (div)
+
+FOR_EACH_OP (DEF)
+
+int
+main (void)
+{
+  double a[N], b[N];
+  for (int i = 0; i < N; ++i)
+    {
+      b[i] = (i % 17) * 10;
+      asm volatile ("" ::: "memory");
+    }
+  FOR_EACH_OP (TEST)
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump { = \.COND_ADD} "optimized" { target vect_double_cond_arith } } } */
+/* { dg-final { scan-tree-dump { = \.COND_SUB} "optimized" { target vect_double_cond_arith } } } */
+/* { dg-final { scan-tree-dump { = \.COND_MUL} "optimized" { target vect_double_cond_arith } } } */
+/* { dg-final { scan-tree-dump { = \.COND_RDIV} "optimized" { target vect_double_cond_arith } } } */
+/* { dg-final { scan-tree-dump-not {VEC_COND_EXPR} "optimized" { target vect_double_cond_arith } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-cond-arith-5.c b/gcc/testsuite/gcc.dg/vect/vect-cond-arith-5.c
new file mode 100644
index 00000000000..d2eadc4e945
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-cond-arith-5.c
@@ -0,0 +1,60 @@
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include "tree-vect.h"
+
+#define N (VECTOR_BITS * 11 / 64 + 3)
+
+#define add(A, B) ((A) + (B))
+#define sub(A, B) ((A) - (B))
+#define mul(A, B) ((A) * (B))
+#define div(A, B) ((A) / (B))
+
+#define DEF(OP)							\
+  void __attribute__ ((noipa))					\
+  f_##OP (double *restrict a, double *restrict b, double x)	\
+  {								\
+    for (int i = 0; i < N; ++i)					\
+      if (b[i] < 100)						\
+	a[i] = OP (b[i], x);					\
+  }
+
+#define TEST(OP)					\
+  {							\
+    f_##OP (a, b, 10);					\
+    for (int i = 0; i < N; ++i)				\
+      {							\
+	int bval = (i % 17) * 10;			\
+	int truev = OP (bval, 10);			\
+	if (a[i] != (bval < 100 ? truev : i * 3))	\
+	  __builtin_abort ();				\
+	asm volatile ("" ::: "memory");			\
+      }							\
+  }
+
+#define FOR_EACH_OP(T)				\
+  T (add)					\
+  T (sub)					\
+  T (mul)					\
+  T (div)
+
+FOR_EACH_OP (DEF)
+
+int
+main (void)
+{
+  double a[N], b[N];
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = i * 3;
+      b[i] = (i % 17) * 10;
+      asm volatile ("" ::: "memory");
+    }
+  FOR_EACH_OP (TEST)
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump { = \.COND_ADD} "optimized" { target { vect_double_cond_arith && vect_masked_store } } } } */
+/* { dg-final { scan-tree-dump { = \.COND_SUB} "optimized" { target { vect_double_cond_arith && vect_masked_store } } } } */
+/* { dg-final { scan-tree-dump { = \.COND_MUL} "optimized" { target { vect_double_cond_arith && vect_masked_store } } } } */
+/* { dg-final { scan-tree-dump { = \.COND_RDIV} "optimized" { target { vect_double_cond_arith && vect_masked_store } } } } */
+/* { dg-final { scan-tree-dump-not {VEC_COND_EXPR} "optimized" { target { vect_double_cond_arith && vect_masked_store } } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c
new file mode 100644
index 00000000000..52138d2b023
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1.c
@@ -0,0 +1,64 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, NAME, OP)				\
+  void __attribute__ ((noinline, noclone))		\
+  test_##TYPE##_##NAME (TYPE *__restrict x,		\
+			TYPE *__restrict y,		\
+			TYPE *__restrict z,		\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      x[i] = pred[i] != 1 ? y[i] OP z[i] : y[i];	\
+  }
+
+#define TEST_INT_TYPE(TYPE) \
+  TEST (TYPE, div, /)
+
+#define TEST_FP_TYPE(TYPE) \
+  TEST (TYPE, add, +) \
+  TEST (TYPE, sub, -) \
+  TEST (TYPE, mul, *) \
+  TEST (TYPE, div, /)
+
+#define TEST_ALL \
+  TEST_INT_TYPE (int8_t) \
+  TEST_INT_TYPE (uint8_t) \
+  TEST_INT_TYPE (int16_t) \
+  TEST_INT_TYPE (uint16_t) \
+  TEST_INT_TYPE (int32_t) \
+  TEST_INT_TYPE (uint32_t) \
+  TEST_INT_TYPE (int64_t) \
+  TEST_INT_TYPE (uint64_t) \
+  TEST_FP_TYPE (float) \
+  TEST_FP_TYPE (double)
+
+TEST_ALL
+
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.b} } } */		\
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.h} } } */		\
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 7 } } */
+/* At present we don't vectorize the uint8_t or uint16_t loops because the
+   division is done directly in the narrow type, rather than being widened
+   to int first.  */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* We fail to optimize away the SEL for the int8_t and int16_t loops,
+   because the 32-bit result is converted before selection.  */
+/* { dg-final { scan-assembler-times {\tsel\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1_run.c
new file mode 100644
index 00000000000..876f98f6ec2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_1_run.c
@@ -0,0 +1,33 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_arith_1.c"
+
+#define N 99
+
+#undef TEST
+#define TEST(TYPE, NAME, OP)					\
+  {								\
+    TYPE x[N], y[N], z[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	y[i] = i * i;						\
+	z[i] = ((i + 2) % 3) * (i + 1);				\
+	pred[i] = i % 3;					\
+      }								\
+    test_##TYPE##_##NAME (x, y, z, pred, N);			\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected = i % 3 != 1 ? y[i] OP z[i] : y[i];	\
+	if (x[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2.c
new file mode 100644
index 00000000000..0474ab52d4a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2.c
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include <stdint.h>
+
+#define TEST(DATA_TYPE, PRED_TYPE, NAME, OP)				\
+  void __attribute__ ((noinline, noclone))				\
+  test_##DATA_TYPE##_##PRED_TYPE##_##NAME (DATA_TYPE *__restrict x,	\
+					   DATA_TYPE *__restrict y,	\
+					   DATA_TYPE *__restrict z,	\
+					   PRED_TYPE *__restrict pred,	\
+					   int n)			\
+  {									\
+    for (int i = 0; i < n; ++i)						\
+      x[i] = pred[i] != 1 ? y[i] OP z[i] : y[i];			\
+  }
+
+#define TEST_INT_TYPE(DATA_TYPE, PRED_TYPE) \
+  TEST (DATA_TYPE, PRED_TYPE, div, /)
+
+#define TEST_FP_TYPE(DATA_TYPE, PRED_TYPE) \
+  TEST (DATA_TYPE, PRED_TYPE, add, +) \
+  TEST (DATA_TYPE, PRED_TYPE, sub, -) \
+  TEST (DATA_TYPE, PRED_TYPE, mul, *) \
+  TEST (DATA_TYPE, PRED_TYPE, div, /)
+
+#define TEST_ALL \
+  TEST_INT_TYPE (int32_t, int8_t) \
+  TEST_INT_TYPE (uint32_t, int8_t) \
+  TEST_INT_TYPE (int32_t, int16_t) \
+  TEST_INT_TYPE (uint32_t, int16_t) \
+  TEST_INT_TYPE (int64_t, int8_t) \
+  TEST_INT_TYPE (uint64_t, int8_t) \
+  TEST_INT_TYPE (int64_t, int16_t) \
+  TEST_INT_TYPE (uint64_t, int16_t) \
+  TEST_INT_TYPE (int64_t, int32_t) \
+  TEST_INT_TYPE (uint64_t, int32_t) \
+  TEST_FP_TYPE (float, int8_t) \
+  TEST_FP_TYPE (float, int16_t) \
+  TEST_FP_TYPE (double, int8_t) \
+  TEST_FP_TYPE (double, int16_t) \
+  TEST_FP_TYPE (double, int32_t)
+
+TEST_ALL
+
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 6 } } */
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 14 } } */
+
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2_run.c
new file mode 100644
index 00000000000..1eac60552ee
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_2_run.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_arith_2.c"
+
+#define N 99
+
+#undef TEST
+#define TEST(DATA_TYPE, PRED_TYPE, NAME, OP)			\
+  {								\
+    DATA_TYPE x[N], y[N], z[N];					\
+    PRED_TYPE pred[N];						\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	y[i] = i * i;						\
+	z[i] = ((i + 2) % 3) * (i + 1);				\
+	pred[i] = i % 3;					\
+      }								\
+    test_##DATA_TYPE##_##PRED_TYPE##_##NAME (x, y, z, pred, N);	\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	DATA_TYPE expected = i % 3 != 1 ? y[i] OP z[i] : y[i];	\
+	if (x[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3.c
new file mode 100644
index 00000000000..94eb255c969
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3.c
@@ -0,0 +1,62 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, NAME, OP)				\
+  void __attribute__ ((noinline, noclone))		\
+  test_##TYPE##_##NAME (TYPE *__restrict x,		\
+			TYPE *__restrict y,		\
+			TYPE *__restrict z,		\
+			TYPE *__restrict pred, int n)	\
+  {							\
+    for (int i = 0; i < n; ++i)				\
+      x[i] = pred[i] != 1 ? y[i] OP z[i] : 1;		\
+  }
+
+#define TEST_INT_TYPE(TYPE) \
+  TEST (TYPE, div, /)
+
+#define TEST_FP_TYPE(TYPE) \
+  TEST (TYPE, add, +) \
+  TEST (TYPE, sub, -) \
+  TEST (TYPE, mul, *) \
+  TEST (TYPE, div, /)
+
+#define TEST_ALL \
+  TEST_INT_TYPE (int8_t) \
+  TEST_INT_TYPE (uint8_t) \
+  TEST_INT_TYPE (int16_t) \
+  TEST_INT_TYPE (uint16_t) \
+  TEST_INT_TYPE (int32_t) \
+  TEST_INT_TYPE (uint32_t) \
+  TEST_INT_TYPE (int64_t) \
+  TEST_INT_TYPE (uint64_t) \
+  TEST_FP_TYPE (float) \
+  TEST_FP_TYPE (double)
+
+TEST_ALL
+
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.b} } } */		\
+/* { dg-final { scan-assembler-not {\t.div\tz[0-9]+\.h} } } */		\
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m,} 7 } } */
+/* At present we don't vectorize the uint8_t or uint16_t loops because the
+   division is done directly in the narrow type, rather than being widened
+   to int first.  */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadd\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfsub\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmul\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.s, p[0-7]/m,} 1 } } */
+/* { dg-final { scan-assembler-times {\tfdiv\tz[0-9]+\.d, p[0-7]/m,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tsel\t} 14 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3_run.c
new file mode 100644
index 00000000000..31457da523b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_3_run.c
@@ -0,0 +1,34 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "cond_arith_3.c"
+
+#define N 99
+
+#undef TEST
+#define TEST(TYPE, NAME, OP)					\
+  {								\
+    TYPE x[N], y[N], z[N], pred[N];				\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	x[i] = -1;						\
+	y[i] = i * i;						\
+	z[i] = ((i + 2) % 3) * (i + 1);				\
+	pred[i] = i % 3;					\
+      }								\
+    test_##TYPE##_##NAME (x, y, z, pred, N);			\
+    for (int i = 0; i < N; ++i)					\
+      {								\
+	TYPE expected = i % 3 != 1 ? y[i] OP z[i] : 1;		\
+	if (x[i] != expected)					\
+	  __builtin_abort ();					\
+	asm volatile ("" ::: "memory");				\
+      }								\
+  }
+
+int
+main (void)
+{
+  TEST_ALL
+  return 0;
+}
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index e9eaa11786a..e181468fba9 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -116,15 +116,19 @@ along with GCC; see the file COPYING3.  If not see
 #include "builtins.h"
 #include "params.h"
 #include "cfganal.h"
+#include "internal-fn.h"
+#include "fold-const.h"
 
 /* Only handle PHIs with no more arguments unless we are asked to by
    simd pragma.  */
 #define MAX_PHI_ARG_NUM \
   ((unsigned) PARAM_VALUE (PARAM_MAX_TREE_IF_CONVERSION_PHI_ARGS))
 
-/* Indicate if new load/store that needs to be predicated is introduced
-   during if conversion.  */
-static bool any_pred_load_store;
+/* True if we've converted a statement that was only executed when some
+   condition C was true, and if for correctness we need to predicate the
+   statement to ensure that it is a no-op when C is false.  See
+   predicate_statements for the kinds of predication we support.  */
+static bool need_to_predicate;
 
 /* Indicate if there are any complicated PHIs that need to be handled in
    if-conversion.  Complicated PHI has more than two arguments and can't
@@ -193,6 +197,9 @@ static hash_map<innermost_loop_behavior_hash,
 /* Hash table to store <base reference, DR> pairs.  */
 static hash_map<tree_operand_hash, data_reference_p> *baseref_DR_map;
 
+/* List of redundant SSA names: the first should be replaced by the second.  */
+static vec< std::pair<tree, tree> > redundant_ssa_names;
+
 /* Structure used to predicate basic blocks.  This is attached to the
    ->aux field of the BBs in the loop to be if-converted.  */
 struct bb_predicate {
@@ -919,19 +926,10 @@ ifcvt_memrefs_wont_trap (gimple *stmt, vec<data_reference_p> drs)
 static bool
 ifcvt_can_use_mask_load_store (gimple *stmt)
 {
-  tree lhs, ref;
-  machine_mode mode;
-  basic_block bb = gimple_bb (stmt);
-  bool is_load;
-
-  if (!(flag_tree_loop_vectorize || bb->loop_father->force_vectorize)
-      || bb->loop_father->dont_vectorize
-      || !gimple_assign_single_p (stmt)
-      || gimple_has_volatile_ops (stmt))
-    return false;
-
   /* Check whether this is a load or store.  */
-  lhs = gimple_assign_lhs (stmt);
+  tree lhs = gimple_assign_lhs (stmt);
+  bool is_load;
+  tree ref;
   if (gimple_store_p (stmt))
     {
       if (!is_gimple_val (gimple_assign_rhs1 (stmt)))
@@ -952,7 +950,7 @@ ifcvt_can_use_mask_load_store (gimple *stmt)
 
   /* Mask should be integer mode of the same size as the load/store
      mode.  */
-  mode = TYPE_MODE (TREE_TYPE (lhs));
+  machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
   if (!int_mode_for_mode (mode).exists () || VECTOR_MODE_P (mode))
     return false;
 
@@ -962,6 +960,32 @@ ifcvt_can_use_mask_load_store (gimple *stmt)
   return false;
 }
 
+/* Return true if STMT, currently unconditional, could be rewritten as a
+   masked load/store or conditional internal function on a bb predicate.  */
+
+static bool
+ifcvt_can_predicate (gimple *stmt)
+{
+  basic_block bb = gimple_bb (stmt);
+
+  if (!(flag_tree_loop_vectorize || bb->loop_father->force_vectorize)
+      || bb->loop_father->dont_vectorize
+      || gimple_has_volatile_ops (stmt))
+    return false;
+
+  if (gimple_assign_single_p (stmt))
+    return ifcvt_can_use_mask_load_store (stmt);
+
+  tree_code code = gimple_assign_rhs_code (stmt);
+  tree lhs_type = TREE_TYPE (gimple_assign_lhs (stmt));
+  tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
+  if (!types_compatible_p (lhs_type, rhs_type))
+    return false;
+  internal_fn cond_fn = get_conditional_internal_fn (code);
+  return (cond_fn != IFN_LAST
+	  && vectorized_internal_fn_supported_p (cond_fn, lhs_type));
+}
+
 /* Return true when STMT is if-convertible.
 
    GIMPLE_ASSIGN statement is not if-convertible if,
@@ -1006,10 +1030,10 @@ if_convertible_gimple_assign_stmt_p (gimple *stmt,
        || ! ifcvt_memrefs_wont_trap (stmt, refs))
       && gimple_could_trap_p (stmt))
     {
-      if (ifcvt_can_use_mask_load_store (stmt))
+      if (ifcvt_can_predicate (stmt))
 	{
 	  gimple_set_plf (stmt, GF_PLF_2, true);
-	  any_pred_load_store = true;
+	  need_to_predicate = true;
 	  return true;
 	}
       if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1020,7 +1044,7 @@ if_convertible_gimple_assign_stmt_p (gimple *stmt,
   /* When if-converting stores force versioning, likewise if we
      ended up generating store data races.  */
   if (gimple_vdef (stmt))
-    any_pred_load_store = true;
+    need_to_predicate = true;
 
   return true;
 }
@@ -2052,7 +2076,7 @@ insert_gimplified_predicates (loop_p loop)
       stmts = bb_predicate_gimplified_stmts (bb);
       if (stmts)
 	{
-	  if (any_pred_load_store)
+	  if (need_to_predicate)
 	    {
 	      /* Insert the predicate of the BB just after the label,
 		 as the if-conversion of memory writes will use this
@@ -2080,7 +2104,7 @@ insert_gimplified_predicates (loop_p loop)
     }
 }
 
-/* Helper function for predicate_mem_writes. Returns index of existent
+/* Helper function for predicate_statements. Returns index of existent
    mask if it was created for given SIZE and -1 otherwise.  */
 
 static int
@@ -2094,6 +2118,160 @@ mask_exists (int size, vec<int> vec)
   return -1;
 }
 
+/* Helper function for predicate_statements.  STMT is a single-rhs
+   assignment that reads or writes memory and needs to be predicated by
+   MASK.  Return the IFN_MASK_LOAD or IFN_MASK_STORE call that does so.
+   GSI is passed to force_gimple_operand_gsi when forming the address.  */
+
+static gimple *
+predicate_load_or_store (gimple_stmt_iterator *gsi, gassign *stmt, tree mask)
+{
+  tree lhs = gimple_assign_lhs (stmt);
+  tree rhs = gimple_assign_rhs1 (stmt);
+  /* STMT is a load if its lhs is an SSA_NAME, otherwise it is a store.  */
+  bool load_p = TREE_CODE (lhs) == SSA_NAME;
+  tree ref = load_p ? rhs : lhs;
+  mark_addressable (ref);
+  tree addr = force_gimple_operand_gsi (gsi, build_fold_addr_expr (ref),
+					true, NULL_TREE, true, GSI_SAME_STMT);
+  tree ptr = build_int_cst (reference_alias_ptr_type (ref),
+			    get_object_alignment (ref));
+  /* Copy points-to info if possible.  */
+  if (TREE_CODE (addr) == SSA_NAME && !SSA_NAME_PTR_INFO (addr))
+    copy_ref_info (build2 (MEM_REF, TREE_TYPE (ref), addr, ptr), ref);
+  gcall *new_stmt;
+  if (load_p)
+    {
+      new_stmt = gimple_build_call_internal (IFN_MASK_LOAD, 3, addr, ptr,
+					     mask);
+      gimple_call_set_lhs (new_stmt, lhs);
+      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+    }
+  else
+    {
+      new_stmt = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
+					     mask, rhs);
+      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
+      /* Make the call the defining statement of STMT's virtual definition.  */
+      gimple_set_vdef (new_stmt, gimple_vdef (stmt));
+      SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt;
+    }
+  gimple_call_set_nothrow (new_stmt, true);
+  return new_stmt;
+}
+
+/* STMT uses OP_LHS, and OP_MASK is an SSA_NAME that is known to have value
+   OP_COND.  Check whether STMT is equivalent to:
+
+     ... = OP_MASK ? OP_LHS : X;
+
+   Return X if so, otherwise return NULL_TREE.  */
+
+static tree
+check_redundant_cond_expr (gimple *stmt, tree op_mask, tree op_cond,
+			   tree op_lhs)
+{
+  gassign *cond_assign = dyn_cast <gassign *> (stmt);
+  if (!cond_assign || gimple_assign_rhs_code (cond_assign) != COND_EXPR)
+    return NULL_TREE;
+
+  tree sel = gimple_assign_rhs1 (cond_assign);
+  tree then_val = gimple_assign_rhs2 (cond_assign);
+  tree else_val = gimple_assign_rhs3 (cond_assign);
+
+  /* Direct match: OP_LHS is selected when the condition holds, so X is
+     the "else" arm.  */
+  bool same_cond_p = (sel == op_mask || operand_equal_p (sel, op_cond, 0));
+  if (same_cond_p && then_val == op_lhs)
+    return else_val;
+  /* Inverse condition: the arms are swapped, so X is the "then" arm.  */
+  bool inverse_p = inverse_conditions_p (sel, op_cond);
+  return (inverse_p && else_val == op_lhs) ? then_val : NULL_TREE;
+}
+
+/* Return true if VALUE is available for use at STMT.  SSA_NAMES is the
+   set of SSA names defined earlier in STMT's block.  Invariants and
+   default definitions are always available.  Any other SSA name must
+   either be in SSA_NAMES, if it is defined in STMT's block, or be defined
+   in a block that dominates STMT's block.  */
+
+static bool
+value_available_p (gimple *stmt, hash_set<tree_ssa_name_hash> *ssa_names,
+		   tree value)
+{
+  if (is_gimple_min_invariant (value))
+    return true;
+  if (TREE_CODE (value) != SSA_NAME)
+    return false;
+  if (SSA_NAME_IS_DEFAULT_DEF (value))
+    return true;
+
+  basic_block value_bb = gimple_bb (SSA_NAME_DEF_STMT (value));
+  basic_block stmt_bb = gimple_bb (stmt);
+  /* Within STMT's own block, only names recorded in SSA_NAMES count.  */
+  if (value_bb == stmt_bb)
+    return ssa_names->contains (value);
+  return dominated_by_p (CDI_DOMINATORS, stmt_bb, value_bb);
+}
+
+/* Helper function for predicate_statements.  STMT is a potentially-trapping
+   arithmetic operation that needs to be predicated by MASK, an SSA_NAME that
+   has value COND.  Return a call to the corresponding conditional internal
+   function that does so.  SSA_NAMES is the set of SSA names defined earlier
+   in STMT's block.  COND_EXPR results that the call makes redundant are
+   recorded in redundant_ssa_names.  */
+
+static gimple *
+predicate_rhs_code (gassign *stmt, tree mask, tree cond,
+		    hash_set<tree_ssa_name_hash> *ssa_names)
+{
+  tree lhs = gimple_assign_lhs (stmt);
+  internal_fn cond_fn
+    = get_conditional_internal_fn (gimple_assign_rhs_code (stmt));
+
+  /* Slot 0 holds the mask, slots 1 to NOPS - 1 the operands of STMT with the
+     same index and slot NOPS the "else" value, which is chosen below.  */
+  unsigned int nops = gimple_num_ops (stmt);
+  auto_vec<tree, 8> args;
+  args.safe_grow (nops + 1);
+  args[0] = mask;
+  for (unsigned int opno = 1; opno < nops; ++opno)
+    args[opno] = gimple_op (stmt, opno);
+  args[nops] = NULL_TREE;
+
+  /* Look through the users of LHS for COND_EXPRs whose other arm can be
+     used as the "else" value.  The first usable arm is the one chosen.  */
+  imm_use_iterator use_iter;
+  gimple *user;
+  FOR_EACH_IMM_USE_STMT (user, use_iter, lhs)
+    {
+      tree other_arm = check_redundant_cond_expr (user, mask, cond, lhs);
+      if (!other_arm || !value_available_p (stmt, ssa_names, other_arm))
+	continue;
+
+      if (!args[nops])
+	args[nops] = other_arm;
+      if (!operand_equal_p (other_arm, args[nops], 0))
+	continue;
+
+      /* We have LHS = IFN_COND (MASK, ..., ELSE) and
+	 X = MASK ? LHS : ELSE, which makes X equivalent to LHS.  */
+      tree user_lhs = gimple_assign_lhs (user);
+      redundant_ssa_names.safe_push (std::make_pair (user_lhs, lhs));
+    }
+
+  /* If no user supplied an "else" value, use the target's preference.  */
+  if (!args[nops])
+    args[nops] = targetm.preferred_else_value (cond_fn, TREE_TYPE (lhs),
+					       nops - 1, &args[1]);
+
+  /* Create the call; the caller puts it in place of STMT.  */
+  gcall *new_stmt = gimple_build_call_internal_vec (cond_fn, args);
+  gimple_call_set_lhs (new_stmt, lhs);
+  gimple_call_set_nothrow (new_stmt, true);
+  return new_stmt;
+}
+
 /* Predicate each write to memory in LOOP.
 
    This function transforms control flow constructs containing memory
@@ -2158,7 +2336,7 @@ mask_exists (int size, vec<int> vec)
    |   goto bb_1
    | end_bb_4
 
-   predicate_mem_writes is then predicating the memory write as follows:
+   predicate_statements is then predicating the memory write as follows:
 
    | bb_0
    |   i = 0
@@ -2202,11 +2380,12 @@ mask_exists (int size, vec<int> vec)
 */
 
 static void
-predicate_mem_writes (loop_p loop)
+predicate_statements (loop_p loop)
 {
   unsigned int i, orig_loop_num_nodes = loop->num_nodes;
   auto_vec<int, 1> vect_sizes;
   auto_vec<tree, 1> vect_masks;
+  hash_set<tree_ssa_name_hash> ssa_names;
 
   for (i = 1; i < orig_loop_num_nodes; i++)
     {
@@ -2214,7 +2393,6 @@ predicate_mem_writes (loop_p loop)
       basic_block bb = ifc_bbs[i];
       tree cond = bb_predicate (bb);
       bool swap;
-      gimple *stmt;
       int index;
 
       if (is_true_predicate (cond))
@@ -2232,7 +2410,8 @@ predicate_mem_writes (loop_p loop)
 
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
 	{
-	  if (!gimple_assign_single_p (stmt = gsi_stmt (gsi)))
+	  gassign *stmt = dyn_cast <gassign *> (gsi_stmt (gsi));
+	  if (!stmt)
 	    ;
 	  else if (is_false_predicate (cond)
 		   && gimple_vdef (stmt))
@@ -2245,19 +2424,13 @@ predicate_mem_writes (loop_p loop)
 	  else if (gimple_plf (stmt, GF_PLF_2))
 	    {
 	      tree lhs = gimple_assign_lhs (stmt);
-	      tree rhs = gimple_assign_rhs1 (stmt);
-	      tree ref, addr, ptr, mask;
-	      gcall *new_stmt;
+	      tree mask;
+	      gimple *new_stmt;
 	      gimple_seq stmts = NULL;
 	      machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
 	      /* We checked before setting GF_PLF_2 that an equivalent
 		 integer mode exists.  */
 	      int bitsize = GET_MODE_BITSIZE (mode).to_constant ();
-	      ref = TREE_CODE (lhs) == SSA_NAME ? rhs : lhs;
-	      mark_addressable (ref);
-	      addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (ref),
-					       true, NULL_TREE, true,
-					       GSI_SAME_STMT);
 	      if (!vect_sizes.is_empty ()
 		  && (index = mask_exists (bitsize, vect_sizes)) != -1)
 		/* Use created mask.  */
@@ -2285,30 +2458,10 @@ predicate_mem_writes (loop_p loop)
 		  vect_sizes.safe_push (bitsize);
 		  vect_masks.safe_push (mask);
 		}
-	      ptr = build_int_cst (reference_alias_ptr_type (ref),
-				   get_object_alignment (ref));
-	      /* Copy points-to info if possible.  */
-	      if (TREE_CODE (addr) == SSA_NAME && !SSA_NAME_PTR_INFO (addr))
-		copy_ref_info (build2 (MEM_REF, TREE_TYPE (ref), addr, ptr),
-			       ref);
-	      if (TREE_CODE (lhs) == SSA_NAME)
-		{
-		  new_stmt
-		    = gimple_build_call_internal (IFN_MASK_LOAD, 3, addr,
-						  ptr, mask);
-		  gimple_call_set_lhs (new_stmt, lhs);
-		  gimple_set_vuse (new_stmt, gimple_vuse (stmt));
-		}
+	      if (gimple_assign_single_p (stmt))
+		new_stmt = predicate_load_or_store (&gsi, stmt, mask);
 	      else
-		{
-		  new_stmt
-		    = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
-						  mask, rhs);
-		  gimple_set_vuse (new_stmt, gimple_vuse (stmt));
-		  gimple_set_vdef (new_stmt, gimple_vdef (stmt));
-		  SSA_NAME_DEF_STMT (gimple_vdef (new_stmt)) = new_stmt;
-		}
-	      gimple_call_set_nothrow (new_stmt, true);
+		new_stmt = predicate_rhs_code (stmt, mask, cond, &ssa_names);
 
 	      gsi_replace (&gsi, new_stmt, true);
 	    }
@@ -2329,8 +2482,12 @@ predicate_mem_writes (loop_p loop)
 	      gimple_assign_set_rhs1 (stmt, ifc_temp_var (type, rhs, &gsi));
 	      update_stmt (stmt);
 	    }
+	  tree lhs = gimple_get_lhs (gsi_stmt (gsi));
+	  if (lhs && TREE_CODE (lhs) == SSA_NAME)
+	    ssa_names.add (lhs);
 	  gsi_next (&gsi);
 	}
+      ssa_names.empty ();
     }
 }
 
@@ -2392,8 +2549,8 @@ combine_blocks (struct loop *loop)
   insert_gimplified_predicates (loop);
   predicate_all_scalar_phis (loop);
 
-  if (any_pred_load_store)
-    predicate_mem_writes (loop);
+  if (need_to_predicate)
+    predicate_statements (loop);
 
   /* Merge basic blocks: first remove all the edges in the loop,
      except for those from the exit block.  */
@@ -2733,6 +2890,12 @@ ifcvt_local_dce (basic_block bb)
   enum gimple_code code;
   use_operand_p use_p;
   imm_use_iterator imm_iter;
+  std::pair <tree, tree> *name_pair;
+  unsigned int i;
+
+  FOR_EACH_VEC_ELT (redundant_ssa_names, i, name_pair)
+    replace_uses_by (name_pair->first, name_pair->second);
+  redundant_ssa_names.release ();
 
   worklist.create (64);
   /* Consider all phi as live statements.  */
@@ -2833,7 +2996,7 @@ tree_if_conversion (struct loop *loop)
  again:
   rloop = NULL;
   ifc_bbs = NULL;
-  any_pred_load_store = false;
+  need_to_predicate = false;
   any_complicated_phi = false;
 
   /* Apply more aggressive if-conversion when loop or its outer loop were
@@ -2854,7 +3017,7 @@ tree_if_conversion (struct loop *loop)
       || !dbg_cnt (if_conversion_tree))
     goto cleanup;
 
-  if ((any_pred_load_store || any_complicated_phi)
+  if ((need_to_predicate || any_complicated_phi)
       && ((!flag_tree_loop_vectorize && !loop->force_vectorize)
 	  || loop->dont_vectorize))
     goto cleanup;
@@ -2864,7 +3027,7 @@ tree_if_conversion (struct loop *loop)
      Either version this loop, or if the pattern is right for outer-loop
      vectorization, version the outer loop.  In the latter case we will
      still if-convert the original inner loop.  */
-  if (any_pred_load_store
+  if (need_to_predicate
       || any_complicated_phi
       || flag_tree_loop_if_convert != 1)
     {
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 8e289508357..4c22afd2b5f 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -3905,65 +3905,68 @@ vect_recog_mask_conversion_pattern (stmt_vec_info stmt_vinfo, tree *type_out)
 
   /* Check for MASK_LOAD ans MASK_STORE calls requiring mask conversion.  */
   if (is_gimple_call (last_stmt)
-      && gimple_call_internal_p (last_stmt)
-      && (gimple_call_internal_fn (last_stmt) == IFN_MASK_STORE
-	  || gimple_call_internal_fn (last_stmt) == IFN_MASK_LOAD))
+      && gimple_call_internal_p (last_stmt))
     {
       gcall *pattern_stmt;
-      bool load = (gimple_call_internal_fn (last_stmt) == IFN_MASK_LOAD);
 
-      if (load)
+      internal_fn ifn = gimple_call_internal_fn (last_stmt);
+      int mask_argno = internal_fn_mask_index (ifn);
+      if (mask_argno < 0)
+	return NULL;
+
+      bool store_p = internal_store_fn_p (ifn);
+      if (store_p)
 	{
-	  lhs = gimple_call_lhs (last_stmt);
-	  vectype1 = get_vectype_for_scalar_type (TREE_TYPE (lhs));
+	  int rhs_index = internal_fn_stored_value_index (ifn);
+	  tree rhs = gimple_call_arg (last_stmt, rhs_index);
+	  vectype1 = get_vectype_for_scalar_type (TREE_TYPE (rhs));
 	}
       else
 	{
-	  rhs2 = gimple_call_arg (last_stmt, 3);
-	  vectype1 = get_vectype_for_scalar_type (TREE_TYPE (rhs2));
+	  lhs = gimple_call_lhs (last_stmt);
+	  vectype1 = get_vectype_for_scalar_type (TREE_TYPE (lhs));
 	}
 
-      rhs1 = gimple_call_arg (last_stmt, 2);
-      rhs1_type = search_type_for_mask (rhs1, vinfo);
-      if (!rhs1_type)
+      tree mask_arg = gimple_call_arg (last_stmt, mask_argno);
+      tree mask_arg_type = search_type_for_mask (mask_arg, vinfo);
+      if (!mask_arg_type)
 	return NULL;
-      vectype2 = get_mask_type_for_scalar_type (rhs1_type);
+      vectype2 = get_mask_type_for_scalar_type (mask_arg_type);
 
       if (!vectype1 || !vectype2
 	  || known_eq (TYPE_VECTOR_SUBPARTS (vectype1),
 		       TYPE_VECTOR_SUBPARTS (vectype2)))
 	return NULL;
 
-      tmp = build_mask_conversion (rhs1, vectype1, stmt_vinfo);
+      tmp = build_mask_conversion (mask_arg, vectype1, stmt_vinfo);
 
-      if (load)
+      auto_vec<tree, 8> args;
+      unsigned int nargs = gimple_call_num_args (last_stmt);
+      args.safe_grow (nargs);
+      for (unsigned int i = 0; i < nargs; ++i)
+	args[i] = ((int) i == mask_argno
+		   ? tmp
+		   : gimple_call_arg (last_stmt, i));
+      pattern_stmt = gimple_build_call_internal_vec (ifn, args);
+
+      if (!store_p)
 	{
 	  lhs = vect_recog_temp_ssa_var (TREE_TYPE (lhs), NULL);
-	  pattern_stmt
-	    = gimple_build_call_internal (IFN_MASK_LOAD, 3,
-					  gimple_call_arg (last_stmt, 0),
-					  gimple_call_arg (last_stmt, 1),
-					  tmp);
 	  gimple_call_set_lhs (pattern_stmt, lhs);
 	}
-      else
-	  pattern_stmt
-	    = gimple_build_call_internal (IFN_MASK_STORE, 4,
-					  gimple_call_arg (last_stmt, 0),
-					  gimple_call_arg (last_stmt, 1),
-					  tmp,
-					  gimple_call_arg (last_stmt, 3));
-
       gimple_call_set_nothrow (pattern_stmt, true);
 
       pattern_stmt_info = new_stmt_vec_info (pattern_stmt, vinfo);
       set_vinfo_for_stmt (pattern_stmt, pattern_stmt_info);
-      STMT_VINFO_DATA_REF (pattern_stmt_info)
-	= STMT_VINFO_DATA_REF (stmt_vinfo);
-      STMT_VINFO_DR_WRT_VEC_LOOP (pattern_stmt_info)
-	= STMT_VINFO_DR_WRT_VEC_LOOP (stmt_vinfo);
-      STMT_VINFO_GATHER_SCATTER_P (pattern_stmt_info)
-	= STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo);
+      if (STMT_VINFO_DATA_REF (stmt_vinfo))
+	{
+	  STMT_VINFO_DATA_REF (pattern_stmt_info)
+	    = STMT_VINFO_DATA_REF (stmt_vinfo);
+	  STMT_VINFO_DR_WRT_VEC_LOOP (pattern_stmt_info)
+	    = STMT_VINFO_DR_WRT_VEC_LOOP (stmt_vinfo);
+	  STMT_VINFO_GATHER_SCATTER_P (pattern_stmt_info)
+	    = STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo);
+	}
 
       *type_out = vectype1;
       vect_pattern_detected ("vect_recog_mask_conversion_pattern", last_stmt);
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 73b81e1c2dd..1c847ae016d 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -3126,12 +3126,14 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
   vec_info *vinfo = stmt_info->vinfo;
   tree fndecl, new_temp, rhs_type;
-  enum vect_def_type dt[3]
-    = {vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type};
-  int ndts = 3;
+  enum vect_def_type dt[4]
+    = { vect_unknown_def_type, vect_unknown_def_type, vect_unknown_def_type,
+	vect_unknown_def_type };
+  int ndts = ARRAY_SIZE (dt);
   gimple *new_stmt = NULL;
   int ncopies, j;
-  vec<tree> vargs = vNULL;
+  auto_vec<tree, 8> vargs;
+  auto_vec<tree, 8> orig_vargs;
   enum { NARROW, NONE, WIDEN } modifier;
   size_t i, nargs;
   tree lhs;
@@ -3170,22 +3172,38 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   /* Bail out if the function has more than three arguments, we do not have
      interesting builtin functions to vectorize with more than two arguments
      except for fma.  No arguments is also not good.  */
-  if (nargs == 0 || nargs > 3)
+  if (nargs == 0 || nargs > 4)
     return false;
 
   /* Ignore the argument of IFN_GOMP_SIMD_LANE, it is magic.  */
-  if (gimple_call_internal_p (stmt)
-      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE)
+  combined_fn cfn = gimple_call_combined_fn (stmt);
+  if (cfn == CFN_GOMP_SIMD_LANE)
     {
       nargs = 0;
       rhs_type = unsigned_type_node;
     }
 
+  int mask_opno = -1;
+  if (internal_fn_p (cfn))
+    mask_opno = internal_fn_mask_index (as_internal_fn (cfn));
+
   for (i = 0; i < nargs; i++)
     {
       tree opvectype;
 
       op = gimple_call_arg (stmt, i);
+      if (!vect_is_simple_use (op, vinfo, &dt[i], &opvectype))
+	{
+	  if (dump_enabled_p ())
+	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+			     "use not simple.\n");
+	  return false;
+	}
+
+      /* Skip the mask argument to an internal function.  This operand
+	 has been converted via a pattern if necessary.  */
+      if ((int) i == mask_opno)
+	continue;
 
       /* We can only handle calls with arguments of the same type.  */
       if (rhs_type
@@ -3199,14 +3217,6 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       if (!rhs_type)
 	rhs_type = TREE_TYPE (op);
 
-      if (!vect_is_simple_use (op, vinfo, &dt[i], &opvectype))
-	{
-	  if (dump_enabled_p ())
-	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                             "use not simple.\n");
-	  return false;
-	}
-
       if (!vectype_in)
 	vectype_in = opvectype;
       else if (opvectype
@@ -3264,7 +3274,6 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
      to vectorize other operations in the loop.  */
   fndecl = NULL_TREE;
   internal_fn ifn = IFN_LAST;
-  combined_fn cfn = gimple_call_combined_fn (stmt);
   tree callee = gimple_call_fndecl (stmt);
 
   /* First try using an internal function.  */
@@ -3328,6 +3337,7 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
      needs to be generated.  */
   gcc_assert (ncopies >= 1);
 
+  vec_loop_masks *masks = (loop_vinfo ? &LOOP_VINFO_MASKS (loop_vinfo) : NULL);
   if (!vec_stmt) /* transformation not required.  */
     {
       STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
@@ -3337,6 +3347,13 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	record_stmt_cost (cost_vec, ncopies / 2,
 			  vec_promote_demote, stmt_info, 0, vect_body);
 
+      if (loop_vinfo && mask_opno >= 0)
+	{
+	  unsigned int nvectors = (slp_node
+				   ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
+				   : ncopies);
+	  vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out);
+	}
       return true;
     }
 
@@ -3349,25 +3366,24 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
   scalar_dest = gimple_call_lhs (stmt);
   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
+  bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+
   prev_stmt_info = NULL;
   if (modifier == NONE || ifn != IFN_LAST)
     {
       tree prev_res = NULL_TREE;
+      vargs.safe_grow (nargs);
+      orig_vargs.safe_grow (nargs);
       for (j = 0; j < ncopies; ++j)
 	{
 	  /* Build argument list for the vectorized call.  */
-	  if (j == 0)
-	    vargs.create (nargs);
-	  else
-	    vargs.truncate (0);
-
 	  if (slp_node)
 	    {
 	      auto_vec<vec<tree> > vec_defs (nargs);
 	      vec<tree> vec_oprnds0;
 
 	      for (i = 0; i < nargs; i++)
-		vargs.quick_push (gimple_call_arg (stmt, i));
+		vargs[i] = gimple_call_arg (stmt, i);
 	      vect_get_slp_defs (vargs, slp_node, &vec_defs);
 	      vec_oprnds0 = vec_defs[0];
 
@@ -3382,6 +3398,9 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 		    }
 		  if (modifier == NARROW)
 		    {
+		      /* We don't define any narrowing conditional functions
+			 at present.  */
+		      gcc_assert (mask_opno < 0);
 		      tree half_res = make_ssa_name (vectype_in);
 		      gcall *call
 			= gimple_build_call_internal_vec (ifn, vargs);
@@ -3400,6 +3419,17 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 		    }
 		  else
 		    {
+		      if (mask_opno >= 0 && masked_loop_p)
+			{
+			  unsigned int vec_num = vec_oprnds0.length ();
+			  /* Always true for SLP.  */
+			  gcc_assert (ncopies == 1);
+			  tree mask = vect_get_loop_mask (gsi, masks, vec_num,
+							  vectype_out, i);
+			  vargs[mask_opno] = prepare_load_store_mask
+			    (TREE_TYPE (mask), mask, vargs[mask_opno], gsi);
+			}
+
 		      gcall *call;
 		      if (ifn != IFN_LAST)
 			call = gimple_build_call_internal_vec (ifn, vargs);
@@ -3429,17 +3459,22 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 		vec_oprnd0
 		  = vect_get_vec_def_for_operand (op, stmt);
 	      else
-		{
-		  vec_oprnd0 = gimple_call_arg (new_stmt, i);
-		  vec_oprnd0
-                    = vect_get_vec_def_for_stmt_copy (dt[i], vec_oprnd0);
-		}
+		vec_oprnd0
+		  = vect_get_vec_def_for_stmt_copy (dt[i], orig_vargs[i]);
 
-	      vargs.quick_push (vec_oprnd0);
+	      orig_vargs[i] = vargs[i] = vec_oprnd0;
+	    }
+
+	  if (mask_opno >= 0 && masked_loop_p)
+	    {
+	      tree mask = vect_get_loop_mask (gsi, masks, ncopies,
+					      vectype_out, j);
+	      vargs[mask_opno]
+		= prepare_load_store_mask (TREE_TYPE (mask), mask,
+					   vargs[mask_opno], gsi);
 	    }
 
-	  if (gimple_call_internal_p (stmt)
-	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE)
+	  if (cfn == CFN_GOMP_SIMD_LANE)
 	    {
 	      tree cst = build_index_vector (vectype_out, j * nunits_out, 1);
 	      tree new_var
@@ -3451,6 +3486,9 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 	    }
 	  else if (modifier == NARROW)
 	    {
+	      /* We don't define any narrowing conditional functions at
+		 present.  */
+	      gcc_assert (mask_opno < 0);
 	      tree half_res = make_ssa_name (vectype_in);
 	      gcall *call = gimple_build_call_internal_vec (ifn, vargs);
 	      gimple_call_set_lhs (call, half_res);
@@ -3490,6 +3528,8 @@ vectorizable_call (gimple *gs, gimple_stmt_iterator *gsi, gimple **vec_stmt,
     }
   else if (modifier == NARROW)
     {
+      /* We don't define any narrowing conditional functions at present.  */
+      gcc_assert (mask_opno < 0);
       for (j = 0; j < ncopies; ++j)
 	{
 	  /* Build argument list for the vectorized call.  */