From: Ilya Enkovich
Date: Wed, 22 Jun 2016 14:05:55 +0000 (+0000)
Subject: re PR tree-optimization/71488 (Wrong code for vector comparisons with ivybridge and...
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=49e76ff114c94634f01ec0e2406b9ece7d58ce2d;p=gcc.git

re PR tree-optimization/71488 (Wrong code for vector comparisons with ivybridge and westmere targets)

gcc/

	PR middle-end/71488
	* tree-vect-patterns.c (vect_recog_mask_conversion_pattern): Support
	comparison of boolean vectors.
	* tree-vect-stmts.c (vectorizable_comparison): Vectorize comparison
	of boolean vectors using bitwise operations.

gcc/testsuite/

	PR middle-end/71488
	* g++.dg/pr71488.C: New test.
	* gcc.dg/vect/vect-bool-cmp.c: New test.

From-SVN: r237706
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index e6e39121912..047f0594b84 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2016-06-22  Ilya Enkovich
+
+	PR middle-end/71488
+	* tree-vect-patterns.c (vect_recog_mask_conversion_pattern): Support
+	comparison of boolean vectors.
+	* tree-vect-stmts.c (vectorizable_comparison): Vectorize comparison
+	of boolean vectors using bitwise operations.
+
 2016-06-22  Andreas Schwab
 
 	* config/aarch64/aarch64-protos.h (aarch64_elf_asm_named_section):
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 62fdac4b381..662a15ba189 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2016-06-22  Ilya Enkovich
+
+	PR middle-end/71488
+	* g++.dg/pr71488.C: New test.
+	* gcc.dg/vect/vect-bool-cmp.c: New test.
+
 2016-06-22  Eric Botcazou
 
 	* gcc.dg/guality/param-5.c: New test.
diff --git a/gcc/testsuite/g++.dg/pr71488.C b/gcc/testsuite/g++.dg/pr71488.C
new file mode 100644
index 00000000000..d7d657ed06e
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr71488.C
@@ -0,0 +1,24 @@
+// PR middle-end/71488
+// { dg-do run }
+// { dg-options "-O3 -std=c++11" }
+// { dg-additional-options "-march=westmere" { target i?86-*-* x86_64-*-* } }
+// { dg-require-effective-target c++11 }
+
+#include <valarray>
+
+int var_4 = 1;
+long long var_9 = 0;
+
+int main() {
+
+  std::valarray<std::valarray<long long>> v10;
+
+  v10.resize(1);
+  v10[0].resize(4);
+
+  for (int i = 0; i < 4; i++)
+    v10[0][i] = ((var_9 == 0) > unsigned (var_4 == 0)) + (var_9 == 0);
+
+  if (v10[0][0] != 2)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-bool-cmp.c b/gcc/testsuite/gcc.dg/vect/vect-bool-cmp.c
new file mode 100644
index 00000000000..a1e2a24accd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-bool-cmp.c
@@ -0,0 +1,252 @@
+/* PR71488 */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_pack_trunc } */
+/* { dg-additional-options "-msse4" { target { i?86-*-* x86_64-*-* } } } */
+
+int i1, i2;
+
+void __attribute__((noclone,noinline))
+fn1 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) > (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn2 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) > (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn3 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) > (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn4 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) >= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn5 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) >= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn6 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) >= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn7 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) < (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn8 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) < (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn9 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) < (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn10 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) <= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn11 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) <= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn12 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) <= (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn13 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) == (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn14 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) == (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn15 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) == (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn16 (int * __restrict__ p1, int * __restrict__ p2, int * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) != (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn17 (int * __restrict__ p1, int * __restrict__ p2, short * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) != (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+void __attribute__((noclone,noinline))
+fn18 (int * __restrict__ p1, int * __restrict__ p2, long long * __restrict__ p3, int size)
+{
+  int i;
+
+  for (i = 0; i < size; i++)
+    p1[i] = ((p2[i] == 0) != (unsigned)(p3[i] == 0)) + (p2[i] == 0);
+}
+
+int
+eq (int i1, int i2) { return i1 == i2; }
+int ne (int i1, int i2) { return i1 != i2; }
+int lt (int i1, int i2) { return i1 < i2; }
+int le (int i1, int i2) { return i1 <= i2; }
+int gt (int i1, int i2) { return i1 > i2; }
+int ge (int i1, int i2) { return i1 >= i2; }
+
+typedef int (*cmp_fn)(int, int);
+
+void
+check (int *p, cmp_fn fn)
+{
+  int i;
+
+  for (i = 0; i < 32; i++)
+    {
+      int t1 = ((i % 4) > 1) == 0;
+      int t2 = (i % 2) == 0;
+      int res = fn (t1, t2) + t1;
+      if (p[i] != res)
+	__builtin_abort ();
+    }
+}
+
+int
+main (int argc, char **argv)
+{
+  int i1[32], i2[32], res[32];
+  short s2[32];
+  long long l2[32];
+  int i;
+
+  for (i = 0; i < 32; i++)
+    {
+      l2[i] = i2[i] = s2[i] = i % 2;
+      i1[i] = (i % 4) > 1;
+      asm ("":::"memory");
+    }
+
+  fn1 (res, i1, i2, 32);
+  check (res, gt);
+  fn2 (res, i1, s2, 32);
+  check (res, gt);
+  fn3 (res, i1, l2, 32);
+  check (res, gt);
+
+  fn4 (res, i1, i2, 32);
+  check (res, ge);
+  fn5 (res, i1, s2, 32);
+  check (res, ge);
+  fn6 (res, i1, l2, 32);
+  check (res, ge);
+
+  fn7 (res, i1, i2, 32);
+  check (res, lt);
+  fn8 (res, i1, s2, 32);
+  check (res, lt);
+  fn9 (res, i1, l2, 32);
+  check (res, lt);
+
+  fn10 (res, i1, i2, 32);
+  check (res, le);
+  fn11 (res, i1, s2, 32);
+  check (res, le);
+  fn12 (res, i1, l2, 32);
+  check (res, le);
+
+  fn13 (res, i1, i2, 32);
+  check (res, eq);
+  fn14 (res, i1, s2, 32);
+  check (res, eq);
+  fn15 (res, i1, l2, 32);
+  check (res, eq);
+
+  fn16 (res, i1, i2, 32);
+  check (res, ne);
+  fn17 (res, i1, s2, 32);
+  check (res, ne);
+  fn18 (res, i1, l2, 32);
+  check (res, ne);
+}
+
+/* { dg-final { scan-tree-dump-times "VECTORIZED" 18 "vect" { target { i?86-*-* x86_64-*-* } } } } */
diff --git a/gcc/tree-vect-patterns.c b/gcc/tree-vect-patterns.c
index 8a2221f9350..f0c515daaa2 100644
--- a/gcc/tree-vect-patterns.c
+++ b/gcc/tree-vect-patterns.c
@@ -3763,7 +3763,8 @@ vect_recog_mask_conversion_pattern (vec<gimple *> *stmts, tree *type_in,
 
       if (rhs_code != BIT_IOR_EXPR
	  && rhs_code != BIT_XOR_EXPR
-	  && rhs_code != BIT_AND_EXPR)
+	  && rhs_code != BIT_AND_EXPR
+	  && TREE_CODE_CLASS (rhs_code) != tcc_comparison)
	return NULL;
 
       rhs2 = gimple_assign_rhs2 (last_stmt);
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index c74f14f0205..5c655029dd5 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -7756,7 +7756,7 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
   enum vect_def_type dts[2] = {vect_unknown_def_type, vect_unknown_def_type};
   unsigned nunits;
   int ncopies;
-  enum tree_code code;
+  enum tree_code code, bitop1 = NOP_EXPR, bitop2 = NOP_EXPR;
   stmt_vec_info prev_stmt_info = NULL;
   int i, j;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
@@ -7829,11 +7829,74 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
   else if (nunits != TYPE_VECTOR_SUBPARTS (vectype))
     return false;
 
+  /* Can't compare mask and non-mask types.  */
+  if (vectype1 && vectype2
+      && (VECTOR_BOOLEAN_TYPE_P (vectype1) ^ VECTOR_BOOLEAN_TYPE_P (vectype2)))
+    return false;
+
+  /* Boolean values may have another representation in vectors
+     and therefore we prefer bit operations over comparison for
+     them (which also works for scalar masks).  We store opcodes
+     to use in bitop1 and bitop2.  Statement is vectorized as
+       BITOP2 (rhs1 BITOP1 rhs2) or
+       rhs1 BITOP2 (BITOP1 rhs2)
+     depending on bitop1 and bitop2 arity.  */
+  if (VECTOR_BOOLEAN_TYPE_P (vectype))
+    {
+      if (code == GT_EXPR)
+	{
+	  bitop1 = BIT_NOT_EXPR;
+	  bitop2 = BIT_AND_EXPR;
+	}
+      else if (code == GE_EXPR)
+	{
+	  bitop1 = BIT_NOT_EXPR;
+	  bitop2 = BIT_IOR_EXPR;
+	}
+      else if (code == LT_EXPR)
+	{
+	  bitop1 = BIT_NOT_EXPR;
+	  bitop2 = BIT_AND_EXPR;
+	  std::swap (rhs1, rhs2);
+	}
+      else if (code == LE_EXPR)
+	{
+	  bitop1 = BIT_NOT_EXPR;
+	  bitop2 = BIT_IOR_EXPR;
+	  std::swap (rhs1, rhs2);
+	}
+      else
+	{
+	  bitop1 = BIT_XOR_EXPR;
+	  if (code == EQ_EXPR)
+	    bitop2 = BIT_NOT_EXPR;
+	}
+    }
+
   if (!vec_stmt)
     {
       STMT_VINFO_TYPE (stmt_info) = comparison_vec_info_type;
-      vect_model_simple_cost (stmt_info, ncopies, dts, NULL, NULL);
-      return expand_vec_cmp_expr_p (vectype, mask_type);
+      vect_model_simple_cost (stmt_info, ncopies * (1 + (bitop2 != NOP_EXPR)),
+			      dts, NULL, NULL);
+      if (bitop1 == NOP_EXPR)
+	return expand_vec_cmp_expr_p (vectype, mask_type);
+      else
+	{
+	  machine_mode mode = TYPE_MODE (vectype);
+	  optab optab;
+
+	  optab = optab_for_tree_code (bitop1, vectype, optab_default);
+	  if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
+	    return false;
+
+	  if (bitop2 != NOP_EXPR)
+	    {
+	      optab = optab_for_tree_code (bitop2, vectype, optab_default);
+	      if (!optab || optab_handler (optab, mode) == CODE_FOR_nothing)
+		return false;
+	    }
+	  return true;
+	}
     }
 
   /* Transform.  */
@@ -7890,8 +7953,31 @@ vectorizable_comparison (gimple *stmt, gimple_stmt_iterator *gsi,
	    vec_rhs2 = vec_oprnds1[i];
 
	  new_temp = make_ssa_name (mask);
-	  new_stmt = gimple_build_assign (new_temp, code, vec_rhs1, vec_rhs2);
-	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	  if (bitop1 == NOP_EXPR)
+	    {
+	      new_stmt = gimple_build_assign (new_temp, code,
+					      vec_rhs1, vec_rhs2);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	    }
+	  else
+	    {
+	      if (bitop1 == BIT_NOT_EXPR)
+		new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs2);
+	      else
+		new_stmt = gimple_build_assign (new_temp, bitop1, vec_rhs1,
+						vec_rhs2);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	      if (bitop2 != NOP_EXPR)
+		{
+		  tree res = make_ssa_name (mask);
+		  if (bitop2 == BIT_NOT_EXPR)
+		    new_stmt = gimple_build_assign (res, bitop2, new_temp);
+		  else
+		    new_stmt = gimple_build_assign (res, bitop2, vec_rhs1,
+						    new_temp);
+		  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+		}
+	    }
	  if (slp_node)
	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
	}
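
The bitop1/bitop2 selection above rests on the usual identities for Boolean
masks: with one-bit truth values, rhs1 > rhs2 is rhs1 AND NOT rhs2,
rhs1 >= rhs2 is rhs1 OR NOT rhs2, equality is NOT of XOR, inequality is XOR,
and LT/LE reduce to GT/GE with the operands swapped.  As a quick sanity
check, the following standalone C sketch (not part of the commit) verifies
each mapping over the two values a mask lane can take, using 0 and ~0u to
model all-zeros and all-ones lanes:

/* Standalone sketch: verify the mask identities behind bitop1/bitop2.
   vals[] holds the two possible lane values; i and j are the matching
   scalar truth values, so (i OP j) gives the expected lane result.  */
#include <assert.h>

int
main (void)
{
  unsigned vals[2] = { 0u, ~0u };

  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 2; j++)
      {
	unsigned a = vals[i], b = vals[j];
	unsigned t = ~0u, f = 0u;

	assert ((a & ~b) == (i > j ? t : f));	/* GT: rhs1 & ~rhs2   */
	assert ((a | ~b) == (i >= j ? t : f));	/* GE: rhs1 | ~rhs2   */
	assert ((b & ~a) == (i < j ? t : f));	/* LT: swapped GT     */
	assert ((b | ~a) == (i <= j ? t : f));	/* LE: swapped GE     */
	assert (~(a ^ b) == (i == j ? t : f));	/* EQ: ~(rhs1 ^ rhs2) */
	assert ((a ^ b) == (i != j ? t : f));	/* NE: rhs1 ^ rhs2    */
      }
  return 0;
}

These identities are also why the !vec_stmt path only has to confirm that
the chosen bitwise optabs exist: for mask operands no vector comparison
instruction is needed at all.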