From: Jakub Jelinek Date: Sat, 15 Dec 2018 11:02:28 +0000 (+0100) Subject: re PR tree-optimization/88464 (AVX-512 vectorization of masked scatter failing with... X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b1985ca02ea01cd32de23be109672a7ebf593a46;p=gcc.git re PR tree-optimization/88464 (AVX-512 vectorization of masked scatter failing with "not suitable for scatter store") PR tree-optimization/88464 PR target/88498 * tree-vect-stmts.c (vect_build_gather_load_calls): For NARROWING and mask with integral masktype, don't try to permute mask vectors, instead emit VEC_UNPACK_{LO,HI}_EXPR. Fix up NOP_EXPR operand. (vectorizable_store): Handle masked scatters with decl and integral mask type. (permute_vec_elements): Allow scalar_dest to be NULL. * config/i386/i386.c (ix86_get_builtin) <case IX86_BUILTIN_GATHER3ALTDIV16SF>: Use lowpart_subreg for masks. <case IX86_BUILTIN_GATHER3ALTDIV8SF>: Don't assume mask and src have to be the same. * gcc.target/i386/avx512f-pr88462-1.c: Rename to ... * gcc.target/i386/avx512f-pr88464-1.c: ... this. Fix up PR number. Expect 4 vectorized loops instead of 3. (f4): New function. * gcc.target/i386/avx512f-pr88462-2.c: Rename to ... * gcc.target/i386/avx512f-pr88464-2.c: ... this. Fix up PR number and #include. (avx512f_test): Prepare arguments for f4 and check the results. * gcc.target/i386/avx512f-pr88464-3.c: New test. * gcc.target/i386/avx512f-pr88464-4.c: New test. From-SVN: r267169 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index fe75781ecb9..f829e93cd49 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,18 @@ +2018-12-15 Jakub Jelinek + + PR tree-optimization/88464 + PR target/88498 + * tree-vect-stmts.c (vect_build_gather_load_calls): For NARROWING + and mask with integral masktype, don't try to permute mask vectors, + instead emit VEC_UNPACK_{LO,HI}_EXPR. Fix up NOP_EXPR operand. + (vectorizable_store): Handle masked scatters with decl and integral + mask type. + (permute_vec_elements): Allow scalar_dest to be NULL.
+ * config/i386/i386.c (ix86_get_builtin) + <case IX86_BUILTIN_GATHER3ALTDIV16SF>: Use lowpart_subreg for masks. + <case IX86_BUILTIN_GATHER3ALTDIV8SF>: Don't assume mask and src have + to be the same. + 2018-12-15 Jan Hubicka * ipa.c (cgraph_build_static_cdtor_1): Add OPTIMIZATION and TARGET diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 4599ca2a7d5..b6dea0c061d 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -37607,13 +37607,7 @@ rdseed_step: op0 = copy_to_mode_reg (GET_MODE (op0), op0); emit_insn (gen (half, op0)); op0 = half; - if (GET_MODE (op3) != VOIDmode) - { - if (!nonimmediate_operand (op3, GET_MODE (op3))) - op3 = copy_to_mode_reg (GET_MODE (op3), op3); - emit_insn (gen (half, op3)); - op3 = half; - } + op3 = lowpart_subreg (QImode, op3, HImode); break; case IX86_BUILTIN_GATHER3ALTDIV8SF: case IX86_BUILTIN_GATHER3ALTDIV8SI: @@ -37630,6 +37624,7 @@ rdseed_step: op0 = half; if (GET_MODE (op3) != VOIDmode) { + half = gen_reg_rtx (mode0); if (!nonimmediate_operand (op3, GET_MODE (op3))) op3 = copy_to_mode_reg (GET_MODE (op3), op3); emit_insn (gen (half, op3)); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 707d728bd29..d815e70a9c0 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,18 @@ +2018-12-15 Jakub Jelinek + + PR tree-optimization/88464 + PR target/88498 + * gcc.target/i386/avx512f-pr88462-1.c: Rename to ... + * gcc.target/i386/avx512f-pr88464-1.c: ... this. Fix up PR number. + Expect 4 vectorized loops instead of 3. + (f4): New function. + * gcc.target/i386/avx512f-pr88462-2.c: Rename to ... + * gcc.target/i386/avx512f-pr88464-2.c: ... this. Fix up PR number + and #include. + (avx512f_test): Prepare arguments for f4 and check the results. + * gcc.target/i386/avx512f-pr88464-3.c: New test. + * gcc.target/i386/avx512f-pr88464-4.c: New test.
+ 2018-12-15 Paolo Carlini PR c++/84644 diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c deleted file mode 100644 index 00992276afa..00000000000 --- a/gcc/testsuite/gcc.target/i386/avx512f-pr88462-1.c +++ /dev/null @@ -1,35 +0,0 @@ -/* PR tree-optimization/88462 */ -/* { dg-do compile } */ -/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ -/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 3 "vect" } } */ -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 3 "vect" } } */ - -__attribute__((noipa)) void -f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n) -{ - int i; -#pragma GCC ivdep - for (i = 0; i < n; ++i) - if (a[i] > 10.0) - a[i] = b[c[i]]; -} - -__attribute__((noipa)) void -f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n) -{ - int i; -#pragma GCC ivdep - for (i = 0; i < n; ++i) - if (a[i] > 10.0) - a[i] = b[c[i]]; -} - -__attribute__((noipa)) void -f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n) -{ - int i; -#pragma GCC ivdep - for (i = 0; i < n; ++i) - if (a[i] > 10.0f) - a[i] = b[c[i]]; -} diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c deleted file mode 100644 index 3b437c5849c..00000000000 --- a/gcc/testsuite/gcc.target/i386/avx512f-pr88462-2.c +++ /dev/null @@ -1,51 +0,0 @@ -/* PR tree-optimization/88462 */ -/* { dg-do run { target { avx512f } } } */ -/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */ - -#include "avx512f-check.h" - -#include "avx512f-pr88462-1.c" - -static void -avx512f_test (void) -{ - double a[1024], b[1024]; - float c[1024], f[1024]; - int d[1024]; - long e[1024]; - int i; - for (i = 0; i < 1024; i++) - { - asm volatile ("" : "+g" 
(i)); - a[i] = (i % 3) != 0 ? 15.0 : -5.0; - b[i] = 2 * i; - d[i] = (i % 3) ? 1023 - i : __INT_MAX__; - } - f1 (a, b, d, 1024); - for (i = 0; i < 1024; i++) - { - asm volatile ("" : "+g" (i)); - if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) - abort (); - a[i] = (i % 3) != 1 ? 15.0 : -5.0; - b[i] = 3 * i; - e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; - } - f2 (a, b, e, 1024); - for (i = 0; i < 1024; i++) - { - asm volatile ("" : "+g" (i)); - if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0)) - abort (); - c[i] = (i % 3) != 2 ? 15.0f : -5.0f; - d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; - f[i] = 4 * i; - } - f3 (c, f, d, 1024); - for (i = 0; i < 1024; i++) - { - asm volatile ("" : "+g" (i)); - if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f)) - abort (); - } -} diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c new file mode 100644 index 00000000000..06d21bb0129 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c @@ -0,0 +1,45 @@ +/* PR tree-optimization/88464 */ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 -fdump-tree-vect-details" } */ +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 "vect" } } */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" } } */ + +__attribute__((noipa)) void +f1 (double * __restrict__ a, const double * __restrict__ b, const int * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (a[i] > 10.0) + a[i] = b[c[i]]; +} + +__attribute__((noipa)) void +f2 (double * __restrict__ a, const double * __restrict__ b, const long * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (a[i] > 10.0) + a[i] = b[c[i]]; +} + +__attribute__((noipa)) void +f3 (float * __restrict__ a, const float * __restrict__ b, const int * __restrict__ c, int n) +{ + int i; +#pragma GCC 
ivdep + for (i = 0; i < n; ++i) + if (a[i] > 10.0f) + a[i] = b[c[i]]; +} + +__attribute__((noipa)) void +f4 (float * __restrict__ a, const float * __restrict__ b, const long * __restrict__ c, int n) +{ + int i; +#pragma GCC ivdep + for (i = 0; i < n; ++i) + if (a[i] > 10.0f) + a[i] = b[c[i]]; +} diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c new file mode 100644 index 00000000000..845bf509d82 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c @@ -0,0 +1,61 @@ +/* PR tree-optimization/88464 */ +/* { dg-do run { target { avx512f } } } */ +/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512" } */ + +#include "avx512f-check.h" + +#include "avx512f-pr88464-1.c" + +static void +avx512f_test (void) +{ + double a[1024], b[1024]; + float c[1024], f[1024]; + int d[1024]; + long e[1024]; + int i; + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + a[i] = (i % 3) != 0 ? 15.0 : -5.0; + b[i] = 2 * i; + d[i] = (i % 3) ? 1023 - i : __INT_MAX__; + } + f1 (a, b, d, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (a[i] != ((i % 3) != 0 ? (1023 - i) * 2.0 : -5.0)) + abort (); + a[i] = (i % 3) != 1 ? 15.0 : -5.0; + b[i] = 3 * i; + e[i] = (i % 3) != 1 ? 1023 - i : __LONG_MAX__; + } + f2 (a, b, e, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (a[i] != ((i % 3) != 1 ? (1023 - i) * 3.0 : -5.0)) + abort (); + c[i] = (i % 3) != 2 ? 15.0f : -5.0f; + d[i] = (i % 3) != 2 ? 1023 - i : __INT_MAX__; + f[i] = 4 * i; + } + f3 (c, f, d, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (c[i] != ((i % 3) != 2 ? (1023 - i) * 4.0f : -5.0f)) + abort (); + c[i] = (i % 3) != 0 ? 15.0f : -5.0f; + e[i] = (i % 3) != 0 ? 1023 - i : __INT_MAX__; + f[i] = 5 * i; + } + f4 (c, f, e, 1024); + for (i = 0; i < 1024; i++) + { + asm volatile ("" : "+g" (i)); + if (c[i] != ((i % 3) != 0 ? 
(1023 - i) * 5.0f : -5.0f)) + abort (); + } +} diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 3188bb8c351..589e018d1e2 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -2655,6 +2655,7 @@ vect_build_gather_load_calls (stmt_vec_info stmt_info, if (mask && TREE_CODE (masktype) == INTEGER_TYPE) masktype = build_same_sized_truth_vector_type (srctype); + tree mask_halftype = masktype; tree perm_mask = NULL_TREE; tree mask_perm_mask = NULL_TREE; if (known_eq (nunits, gather_off_nunits)) @@ -2690,13 +2691,16 @@ vect_build_gather_load_calls (stmt_vec_info stmt_info, ncopies *= 2; - if (mask) + if (mask && masktype == real_masktype) { for (int i = 0; i < count; ++i) sel[i] = i | (count / 2); indices.new_vector (sel, 2, count); mask_perm_mask = vect_gen_perm_mask_checked (masktype, indices); } + else if (mask) + mask_halftype + = build_same_sized_truth_vector_type (gs_info->offset_vectype); } else gcc_unreachable (); @@ -2761,7 +2765,7 @@ vect_build_gather_load_calls (stmt_vec_info stmt_info, { if (j == 0) vec_mask = vect_get_vec_def_for_operand (mask, stmt_info); - else + else if (modifier != NARROW || (j & 1) == 0) vec_mask = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_mask); @@ -2779,17 +2783,27 @@ vect_build_gather_load_calls (stmt_vec_info stmt_info, mask_op = var; } } + if (modifier == NARROW && masktype != real_masktype) + { + var = vect_get_new_ssa_name (mask_halftype, vect_simple_var); + gassign *new_stmt + = gimple_build_assign (var, (j & 1) ? 
VEC_UNPACK_HI_EXPR + : VEC_UNPACK_LO_EXPR, + mask_op); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + mask_op = var; + } src_op = mask_op; } tree mask_arg = mask_op; if (masktype != real_masktype) { - tree utype; - if (TYPE_MODE (real_masktype) == TYPE_MODE (masktype)) + tree utype, optype = TREE_TYPE (mask_op); + if (TYPE_MODE (real_masktype) == TYPE_MODE (optype)) utype = real_masktype; else - utype = lang_hooks.types.type_for_mode (TYPE_MODE (masktype), 1); + utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1); var = vect_get_new_ssa_name (utype, vect_scalar_var); mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_op); gassign *new_stmt @@ -2801,7 +2815,7 @@ vect_build_gather_load_calls (stmt_vec_info stmt_info, gcc_assert (TYPE_PRECISION (utype) <= TYPE_PRECISION (real_masktype)); var = vect_get_new_ssa_name (real_masktype, vect_scalar_var); - new_stmt = gimple_build_assign (var, NOP_EXPR, utype); + new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg); vect_finish_stmt_generation (stmt_info, new_stmt, gsi); mask_arg = var; } @@ -6361,7 +6375,8 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, return false; } else if (memory_access_type != VMAT_LOAD_STORE_LANES - && (memory_access_type != VMAT_GATHER_SCATTER || gs_info.decl)) + && (memory_access_type != VMAT_GATHER_SCATTER + || (gs_info.decl && !VECTOR_BOOLEAN_TYPE_P (mask_vectype)))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, @@ -6419,7 +6434,9 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, src; tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); tree rettype, srctype, ptrtype, idxtype, masktype, scaletype; - tree ptr, mask, var, scale, perm_mask = NULL_TREE; + tree ptr, var, scale, vec_mask; + tree mask_arg = NULL_TREE, mask_op = NULL_TREE, perm_mask = NULL_TREE; + tree mask_halfvectype = mask_vectype; edge pe = loop_preheader_edge 
(loop); gimple_seq seq; basic_block new_bb; @@ -6460,6 +6477,10 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, perm_mask = vect_gen_perm_mask_checked (vectype, indices); gcc_assert (perm_mask != NULL_TREE); ncopies *= 2; + + if (mask) + mask_halfvectype + = build_same_sized_truth_vector_type (gs_info.offset_vectype); } else gcc_unreachable (); @@ -6482,10 +6503,11 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, gcc_assert (!new_bb); } - /* Currently we support only unconditional scatter stores, - so mask should be all ones. */ - mask = build_int_cst (masktype, -1); - mask = vect_init_vector (stmt_info, mask, masktype, NULL); + if (mask == NULL_TREE) + { + mask_arg = build_int_cst (masktype, -1); + mask_arg = vect_init_vector (stmt_info, mask_arg, masktype, NULL); + } scale = build_int_cst (scaletype, gs_info.scale); @@ -6494,36 +6516,46 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, { if (j == 0) { - src = vec_oprnd1 - = vect_get_vec_def_for_operand (op, stmt_info); - op = vec_oprnd0 - = vect_get_vec_def_for_operand (gs_info.offset, stmt_info); + src = vec_oprnd1 = vect_get_vec_def_for_operand (op, stmt_info); + op = vec_oprnd0 = vect_get_vec_def_for_operand (gs_info.offset, + stmt_info); + if (mask) + mask_op = vec_mask = vect_get_vec_def_for_operand (mask, + stmt_info); } else if (modifier != NONE && (j & 1)) { if (modifier == WIDEN) { - src = vec_oprnd1 - = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1); + src + = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, + vec_oprnd1); op = permute_vec_elements (vec_oprnd0, vec_oprnd0, perm_mask, stmt_info, gsi); + if (mask) + mask_op + = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo, + vec_mask); } else if (modifier == NARROW) { src = permute_vec_elements (vec_oprnd1, vec_oprnd1, perm_mask, stmt_info, gsi); - op = vec_oprnd0 - = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0); + op = vec_oprnd0 = 
vect_get_vec_def_for_stmt_copy (vinfo, + vec_oprnd0); } else gcc_unreachable (); } else { - src = vec_oprnd1 - = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd1); - op = vec_oprnd0 - = vect_get_vec_def_for_stmt_copy (vinfo, vec_oprnd0); + src = vec_oprnd1 = vect_get_vec_def_for_stmt_copy (vinfo, + vec_oprnd1); + op = vec_oprnd0 = vect_get_vec_def_for_stmt_copy (vinfo, + vec_oprnd0); + if (mask) + mask_op = vec_mask = vect_get_vec_def_for_stmt_copy (vinfo, + vec_mask); } if (!useless_type_conversion_p (srctype, TREE_TYPE (src))) @@ -6550,8 +6582,45 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, op = var; } + if (mask) + { + tree utype; + mask_arg = mask_op; + if (modifier == NARROW) + { + var = vect_get_new_ssa_name (mask_halfvectype, + vect_simple_var); + gassign *new_stmt + = gimple_build_assign (var, (j & 1) ? VEC_UNPACK_HI_EXPR + : VEC_UNPACK_LO_EXPR, + mask_op); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + mask_arg = var; + } + tree optype = TREE_TYPE (mask_arg); + if (TYPE_MODE (masktype) == TYPE_MODE (optype)) + utype = masktype; + else + utype = lang_hooks.types.type_for_mode (TYPE_MODE (optype), 1); + var = vect_get_new_ssa_name (utype, vect_scalar_var); + mask_arg = build1 (VIEW_CONVERT_EXPR, utype, mask_arg); + gassign *new_stmt + = gimple_build_assign (var, VIEW_CONVERT_EXPR, mask_arg); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + mask_arg = var; + if (!useless_type_conversion_p (masktype, utype)) + { + gcc_assert (TYPE_PRECISION (utype) + <= TYPE_PRECISION (masktype)); + var = vect_get_new_ssa_name (masktype, vect_scalar_var); + new_stmt = gimple_build_assign (var, NOP_EXPR, mask_arg); + vect_finish_stmt_generation (stmt_info, new_stmt, gsi); + mask_arg = var; + } + } + gcall *new_stmt - = gimple_build_call (gs_info.decl, 5, ptr, mask, op, src, scale); + = gimple_build_call (gs_info.decl, 5, ptr, mask_arg, op, src, scale); stmt_vec_info new_stmt_info = vect_finish_stmt_generation (stmt_info, 
new_stmt, gsi); @@ -7284,7 +7353,7 @@ permute_vec_elements (tree x, tree y, tree mask_vec, stmt_vec_info stmt_info, gimple *perm_stmt; tree scalar_dest = gimple_get_lhs (stmt_info->stmt); - if (TREE_CODE (scalar_dest) == SSA_NAME) + if (scalar_dest && TREE_CODE (scalar_dest) == SSA_NAME) perm_dest = vect_create_destination_var (scalar_dest, vectype); else perm_dest = vect_get_new_vect_var (vectype, vect_simple_var, NULL);