re PR target/86753 (gcc.target/aarch64/sve/vcond_[45].c fail after recent combine...
author Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
Fri, 18 Oct 2019 05:13:26 +0000 (05:13 +0000)
committer Prathamesh Kulkarni <prathamesh3492@gcc.gnu.org>
Fri, 18 Oct 2019 05:13:26 +0000 (05:13 +0000)
2019-10-18  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>
    Richard Sandiford  <richard.sandiford@arm.com>

PR target/86753
* tree-vectorizer.h (scalar_cond_masked_key): New struct,
and define hashmap traits for it.
(loop_vec_info::scalar_cond_masked_set): New member.
(vect_record_loop_mask): Adjust prototype.
* tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree):
Implement method.
* tree-vect-loop.c (vectorizable_reduction): Pass NULL as last arg to
vect_record_loop_mask.
(vectorizable_live_operation): Likewise.
(vect_record_loop_mask): New param scalar_mask. Add entry
cond, loop_mask to scalar_cond_masked_set if scalar_mask is non NULL.
* tree-vect-stmts.c (check_load_store_masking): New param scalar_mask.
Pass it as last arg to vect_record_loop_mask.
(vectorizable_call): Pass scalar_mask as last arg to
vect_record_loop_mask.
(vectorizable_store): Likewise.
(vectorizable_load): Likewise.
(vectorizable_condition): Check if another part of vectorized code
applies loop_mask to condition or to its inverse, and if yes,
apply loop_mask to result of vector comparison.

testsuite/
* gcc.target/aarch64/sve/cond_cnot_2.c: Remove XFAIL
from { scan-assembler-not {\tsel\t}.
* gcc.target/aarch64/sve/cond_convert_1.c: Adjust to make
only one load conditional.
* gcc.target/aarch64/sve/cond_convert_4.c: Likewise.
* gcc.target/aarch64/sve/cond_unary_2.c: Likewise.
* gcc.target/aarch64/sve/vcond_4.c: Remove XFAIL's.
* gcc.target/aarch64/sve/vcond_5.c: Likewise.

Co-Authored-By: Richard Sandiford <richard.sandiford@arm.com>
From-SVN: r277141

12 files changed:
gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
gcc/tree-vect-loop.c
gcc/tree-vect-stmts.c
gcc/tree-vectorizer.c
gcc/tree-vectorizer.h

index b9a6c15087b031cb197acb3977e20497439dc69f..273d13c92c31a4013c38136789477bbf011edbc6 100644 (file)
@@ -1,3 +1,28 @@
+2019-10-18  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>
+           Richard Sandiford  <richard.sandiford@arm.com>
+
+       PR target/86753
+       * tree-vectorizer.h (scalar_cond_masked_key): New struct,
+       and define hashmap traits for it.
+       (loop_vec_info::scalar_cond_masked_set): New member.
+       (vect_record_loop_mask): Adjust prototype.
+       * tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree):
+       Implement method.
+       * tree-vect-loop.c (vectorizable_reduction): Pass NULL as last arg to
+       vect_record_loop_mask.
+       (vectorizable_live_operation): Likewise.
+       (vect_record_loop_mask): New param scalar_mask. Add entry
+       cond, loop_mask to scalar_cond_masked_set if scalar_mask is non NULL.
+       * tree-vect-stmts.c (check_load_store_masking): New param scalar_mask.
+       Pass it as last arg to vect_record_loop_mask.
+       (vectorizable_call): Pass scalar_mask as last arg to
+       vect_record_loop_mask.
+       (vectorizable_store): Likewise.
+       (vectorizable_load): Likewise.
+       (vectorizable_condition): Check if another part of vectorized code
+       applies loop_mask to condition or to its inverse, and if yes,
+       apply loop_mask to result of vector comparison.
+
 2019-10-17  John David Anglin  <danglin@gcc.gnu.org>
 
        * config/pa/pa.c (pa_output_indirect_call): Fix typos in last change.
index 2d2a2748249f82cdf47598c1f78a85d0e4aabddf..fc6345794682d1ec692af40f9776117ea9384f3f 100644 (file)
@@ -1,3 +1,16 @@
+2019-10-18  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>
+           Richard Sandiford  <richard.sandiford@arm.com>
+
+       PR target/86753
+       * gcc.target/aarch64/sve/cond_cnot_2.c: Remove XFAIL
+       from { scan-assembler-not {\tsel\t}.
+       * gcc.target/aarch64/sve/cond_convert_1.c: Adjust to make
+       only one load conditional.
+       * gcc.target/aarch64/sve/cond_convert_4.c: Likewise.
+       * gcc.target/aarch64/sve/cond_unary_2.c: Likewise.
+       * gcc.target/aarch64/sve/vcond_4.c: Remove XFAIL's.
+       * gcc.target/aarch64/sve/vcond_5.c: Likewise.
+
 2019-10-18  Jakub Jelinek  <jakub@redhat.com>
 
        PR tree-optimization/92056
index d689e21dc1143b4235138a1fd69814d090493245..3df2431be3816fa1af4954f41935dfe95d471cf1 100644 (file)
@@ -32,4 +32,4 @@ TEST_ALL (DEF_LOOP)
 /* { dg-final { scan-assembler-not {\tmov\tz} } } */
 /* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
 /* Currently we canonicalize the ?: so that !b[i] is the "false" value.  */
-/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
index dcc30768f88ef33e456c69634add41174d4dc24b..86064ebfcba897e1a6ee77b43d93e3869b35cc55 100644 (file)
                   INT_TYPE *__restrict pred, int n)            \
   {                                                            \
     for (int i = 0; i < n; ++i)                                        \
-      r[i] = pred[i] ? (FLOAT_TYPE) a[i] : b[i];               \
+      {                                                                \
+       FLOAT_TYPE bi = b[i];                                   \
+       r[i] = pred[i] ? (FLOAT_TYPE) a[i] : bi;                \
+      }                                                                \
   }
 
 #define TEST_ALL(T) \
index 7e5f2a73ed96a43bf5ca1c5a725f6f979fafac9d..e3a947b26983ee33d43a01be4219fed9ac527745 100644 (file)
                   INT_TYPE *__restrict pred, int n)            \
   {                                                            \
     for (int i = 0; i < n; ++i)                                        \
-      r[i] = pred[i] ? (INT_TYPE) a[i] : b[i];                 \
+      {                                                                \
+       INT_TYPE bi = b[i];                                     \
+       r[i] = pred[i] ? (INT_TYPE) a[i] : bi;                  \
+      }                                                                \
   }
 
 #define TEST_ALL(T) \
index 991ccf016d1e2233f3d12988d095404d8e9cabbf..97d1b8f5d4554d3a74b07c939ee2e5942b40d570 100644 (file)
                      TYPE *__restrict pred, int n)             \
   {                                                            \
     for (int i = 0; i < n; ++i)                                        \
-      r[i] = pred[i] ? OP (a[i]) : b[i];                       \
+      {                                                                \
+       TYPE bi = b[i];                                         \
+       r[i] = pred[i] ? OP (a[i]) : bi;                        \
+      }                                                                \
   }
 
 #define TEST_INT_TYPE(T, TYPE) \
index 00d84760a19729b0365e12323fb17c424ab73ddd..b38f23e87baa2251dd1a60732b5707527648406e 100644 (file)
@@ -98,24 +98,24 @@ TEST_CMP (nugt)
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
 
 /* 5 for lt, 5 for ult and 5 for nult.  */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for le, 5 for ule and 5 for nule.  */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for gt, 5 for ugt and 5 for nugt.  */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for ge, 5 for uge and 5 for nuge.  */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} } } */
 /* 3 loops * 5 invocations for all 12 unordered comparisons.  */
-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 } } */
 
 /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 7 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 14 { xfail *-*-* } } } */
@@ -123,19 +123,19 @@ TEST_CMP (nugt)
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} } } */
 /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2,
    for all 12 unordered comparisons.  */
-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 } } */
index 23bfb7b264982b6d4d2ac5483e9adec60cc85011..2f16fbff5225d35f18dceada85f1fabe2c7a2692 100644 (file)
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
 
 /* 5 for le, 5 for ule and 5 for nule.  */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
 
 /* 5 for gt, 5 for ugt, 5 for nueq and 5 for nugt.  */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 20 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
 
 /* 5 for ge, 5 for uge and 5 for nuge.  */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} } } */
 /* 3 loops * 5 invocations for ordered, unordered and ueq.  */
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
 
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} } } */
 /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2,
index 80db6abe49021b6c0eb04df109148d7ce1f82393..10920acc9e01506fd495ebc09fbe6e2ef17c5b19 100644 (file)
@@ -6330,7 +6330,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
        }
       else
        vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
-                              vectype_in);
+                              vectype_in, NULL);
     }
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
@@ -7561,7 +7561,7 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
              gcc_assert (ncopies == 1 && !slp_node);
              vect_record_loop_mask (loop_vinfo,
                                     &LOOP_VINFO_MASKS (loop_vinfo),
-                                    1, vectype);
+                                    1, vectype, NULL);
            }
        }
       return true;
@@ -7760,11 +7760,12 @@ vect_double_mask_nunits (tree type)
 
 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
    contain a sequence of NVECTORS masks that each control a vector of type
-   VECTYPE.  */
+   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
+   these vector masks with the vector version of SCALAR_MASK.  */
 
 void
 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
-                      unsigned int nvectors, tree vectype)
+                      unsigned int nvectors, tree vectype, tree scalar_mask)
 {
   gcc_assert (nvectors != 0);
   if (masks->length () < nvectors)
@@ -7775,6 +7776,13 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
   unsigned int nscalars_per_iter
     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
                 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+  if (scalar_mask)
+    {
+      scalar_cond_masked_key cond (scalar_mask, nvectors);
+      loop_vinfo->scalar_cond_masked_set.add (cond);
+    }
+
   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
     {
       rgm->max_nscalars_per_iter = nscalars_per_iter;
index e7255fb76bc12b9e43046d2891ddbd511412dbae..acdd90784dcf4d6d354c85da2766884835be98bc 100644 (file)
@@ -1879,7 +1879,8 @@ static tree permute_vec_elements (tree, tree, tree, stmt_vec_info,
    says how the load or store is going to be implemented and GROUP_SIZE
    is the number of load or store statements in the containing group.
    If the access is a gather load or scatter store, GS_INFO describes
-   its arguments.
+   its arguments.  If the load or store is conditional, SCALAR_MASK is the
+   condition under which it occurs.
 
    Clear LOOP_VINFO_CAN_FULLY_MASK_P if a fully-masked loop is not
    supported, otherwise record the required mask types.  */
@@ -1888,7 +1889,7 @@ static void
 check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
                          vec_load_store_type vls_type, int group_size,
                          vect_memory_access_type memory_access_type,
-                         gather_scatter_info *gs_info)
+                         gather_scatter_info *gs_info, tree scalar_mask)
 {
   /* Invariant loads need no special support.  */
   if (memory_access_type == VMAT_INVARIANT)
@@ -1912,7 +1913,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
          return;
        }
       unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
-      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
       return;
     }
 
@@ -1936,7 +1937,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
          return;
        }
       unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
-      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
       return;
     }
 
@@ -1974,7 +1975,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned int nvectors;
   if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
-    vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype);
+    vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
   else
     gcc_unreachable ();
 }
@@ -3436,7 +3437,9 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
          unsigned int nvectors = (slp_node
                                   ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
                                   : ncopies);
-         vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out);
+         tree scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
+         vect_record_loop_mask (loop_vinfo, masks, nvectors,
+                                vectype_out, scalar_mask);
        }
       return true;
     }
@@ -7390,7 +7393,7 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       if (loop_vinfo
          && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
        check_load_store_masking (loop_vinfo, vectype, vls_type, group_size,
-                                 memory_access_type, &gs_info);
+                                 memory_access_type, &gs_info, mask);
 
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
       vect_model_store_cost (stmt_info, ncopies, rhs_dt, memory_access_type,
@@ -8637,7 +8640,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       if (loop_vinfo
          && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
        check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size,
-                                 memory_access_type, &gs_info);
+                                 memory_access_type, &gs_info, mask);
 
       STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
       vect_model_load_cost (stmt_info, ncopies, memory_access_type,
@@ -10007,6 +10010,35 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   /* Handle cond expr.  */
   for (j = 0; j < ncopies; j++)
     {
+      tree loop_mask = NULL_TREE;
+      bool swap_cond_operands = false;
+
+      /* See whether another part of the vectorized code applies a loop
+        mask to the condition, or to its inverse.  */
+
+      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       {
+         scalar_cond_masked_key cond (cond_expr, ncopies);
+         if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+           {
+             vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+             loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
+           }
+         else
+           {
+             bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
+             cond.code = invert_tree_comparison (cond.code, honor_nans);
+             if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+               {
+                 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+                 loop_mask = vect_get_loop_mask (gsi, masks, ncopies,
+                                                 vectype, j);
+                 cond_code = cond.code;
+                 swap_cond_operands = true;
+               }
+           }
+       }
+
       stmt_vec_info new_stmt_info = NULL;
       if (j == 0)
        {
@@ -10084,6 +10116,9 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
           vec_then_clause = vec_oprnds2[i];
           vec_else_clause = vec_oprnds3[i];
 
+         if (swap_cond_operands)
+           std::swap (vec_then_clause, vec_else_clause);
+
          if (masked)
            vec_compare = vec_cond_lhs;
          else
@@ -10122,6 +10157,50 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                    }
                }
            }
+
+         /* If we decided to apply a loop mask to the result of the vector
+             comparison, AND the comparison with the mask now.  Later passes
+             should then be able to reuse the AND results between multiple
+             vector statements.
+
+            For example:
+            for (int i = 0; i < 100; ++i)
+              x[i] = y[i] ? z[i] : 10;
+
+            results in the following optimized GIMPLE:
+
+            mask__35.8_43 = vect__4.7_41 != { 0, ... };
+            vec_mask_and_46 = loop_mask_40 & mask__35.8_43;
+            _19 = &MEM[base: z_12(D), index: ivtmp_56, step: 4, offset: 0B];
+            vect_iftmp.11_47 = .MASK_LOAD (_19, 4B, vec_mask_and_46);
+            vect_iftmp.12_52 = VEC_COND_EXPR <vec_mask_and_46,
+                                              vect_iftmp.11_47, { 10, ... }>;
+
+            instead of using masked and unmasked forms of
+            vec != { 0, ... } (masked in the MASK_LOAD,
+            unmasked in the VEC_COND_EXPR).  */
+
+         if (loop_mask)
+           {
+             if (COMPARISON_CLASS_P (vec_compare))
+               {
+                 tree tmp = make_ssa_name (vec_cmp_type);
+                 tree op0 = TREE_OPERAND (vec_compare, 0);
+                 tree op1 = TREE_OPERAND (vec_compare, 1);
+                 gassign *g = gimple_build_assign (tmp,
+                                                   TREE_CODE (vec_compare),
+                                                   op0, op1);
+                 vect_finish_stmt_generation (stmt_info, g, gsi);
+                 vec_compare = tmp;
+               }
+
+             tree tmp2 = make_ssa_name (vec_cmp_type);
+             gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR,
+                                               vec_compare, loop_mask);
+             vect_finish_stmt_generation (stmt_info, g, gsi);
+             vec_compare = tmp2;
+           }
+
          if (reduction_type == EXTRACT_LAST_REDUCTION)
            {
              if (!is_gimple_val (vec_compare))
index 512e2e001da748bf36a8a8e211f52d06862f107a..1a0cc93582b3fbb01d7797b15728f2abc70182f8 100644 (file)
@@ -1515,3 +1515,36 @@ make_pass_ipa_increase_alignment (gcc::context *ctxt)
 {
   return new pass_ipa_increase_alignment (ctxt);
 }
+
+/* If the condition represented by T is a comparison or the SSA name
+   result of a comparison, extract the comparison's operands.  Represent
+   T as NE_EXPR <T, 0> otherwise.  */
+
+void
+scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
+{
+  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
+    {
+      this->code = TREE_CODE (t);
+      this->op0 = TREE_OPERAND (t, 0);
+      this->op1 = TREE_OPERAND (t, 1);
+      return;
+    }
+
+  if (TREE_CODE (t) == SSA_NAME)
+    if (gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (t)))
+      {
+       tree_code code = gimple_assign_rhs_code (stmt);
+       if (TREE_CODE_CLASS (code) == tcc_comparison)
+         {
+           this->code = code;
+           this->op0 = gimple_assign_rhs1 (stmt);
+           this->op1 = gimple_assign_rhs2 (stmt);
+           return;
+         }
+      }
+
+  this->code = NE_EXPR;
+  this->op0 = t;
+  this->op1 = build_zero_cst (TREE_TYPE (t));
+}
index d59ba13bc4ec85e204c2c785267ad0ad659d18b8..5c3b3c9b91eecc2c48ef9985255335cf37841ce4 100644 (file)
@@ -177,7 +177,75 @@ public:
 #define SLP_TREE_TWO_OPERATORS(S)               (S)->two_operators
 #define SLP_TREE_DEF_TYPE(S)                    (S)->def_type
 
+/* Key for the set of scalar conditions that have a loop mask applied
+   (scalar_cond_masked_set_type, a hash_set); the set is populated by
+   vect_record_loop_mask.  */
 
+struct scalar_cond_masked_key
+{
+  scalar_cond_masked_key (tree t, unsigned ncopies_)
+    : ncopies (ncopies_)
+  {
+    get_cond_ops_from_tree (t);
+  }
+
+  void get_cond_ops_from_tree (tree);
+
+  unsigned ncopies;
+  tree_code code;
+  tree op0;
+  tree op1;
+};
+
+template<>
+struct default_hash_traits<scalar_cond_masked_key>
+{
+  typedef scalar_cond_masked_key compare_type;
+  typedef scalar_cond_masked_key value_type;
+
+  static inline hashval_t
+  hash (value_type v)
+  {
+    inchash::hash h;
+    h.add_int (v.code);
+    inchash::add_expr (v.op0, h, 0);
+    inchash::add_expr (v.op1, h, 0);
+    h.add_int (v.ncopies);
+    return h.end ();
+  }
+
+  static inline bool
+  equal (value_type existing, value_type candidate)
+  {
+    return (existing.ncopies == candidate.ncopies
+           && existing.code == candidate.code
+           && operand_equal_p (existing.op0, candidate.op0, 0)
+           && operand_equal_p (existing.op1, candidate.op1, 0));
+  }
+
+  static inline void
+  mark_empty (value_type &v)
+  {
+    v.ncopies = 0;
+  }
+
+  static inline bool
+  is_empty (value_type v)
+  {
+    return v.ncopies == 0;
+  }
+
+  static inline void mark_deleted (value_type &) {}
+
+  static inline bool is_deleted (const value_type &)
+  {
+    return false;
+  }
+
+  static inline void remove (value_type &) {}
+};
+
+typedef hash_set<scalar_cond_masked_key> scalar_cond_masked_set_type;
 
 /* Describes two objects whose addresses must be unequal for the vectorized
    loop to be valid.  */
@@ -426,6 +494,9 @@ public:
      on inactive scalars.  */
   vec_loop_masks masks;
 
+  /* Set of scalar conditions that have loop mask applied.  */
+  scalar_cond_masked_set_type scalar_cond_masked_set;
+
   /* If we are using a loop mask to align memory addresses, this variable
      contains the number of vector elements that we should skip in the
      first iteration of the vector loop (i.e. the number of leading
@@ -1637,7 +1708,7 @@ extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
 extern tree vect_halve_mask_nunits (tree);
 extern tree vect_double_mask_nunits (tree);
 extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
-                                  unsigned int, tree);
+                                  unsigned int, tree, tree);
 extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
                                unsigned int, tree, unsigned int);
 extern stmt_vec_info info_for_reduction (stmt_vec_info);