From: Richard Biener Date: Thu, 28 Nov 2019 12:22:04 +0000 (+0000) Subject: re PR tree-optimization/92645 (Hand written vector code is 450 times slower when... X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=78307657cf9675bc4aa2e77561c823834714b4c8;p=gcc.git re PR tree-optimization/92645 (Hand written vector code is 450 times slower when compiled with GCC compared to Clang) 2019-11-28 Richard Biener PR tree-optimization/92645 * tree-ssa-forwprop.c (get_bit_field_ref_def): Also handle conversions inside a mode class. Remove restriction on preserving the element size. (simplify_vector_constructor): Deal with the above and for identity permutes also try using VEC_UNPACK_[FLOAT_]LO_EXPR and VEC_PACK_TRUNC_EXPR. * gcc.target/i386/pr92645-4.c: New testcase. From-SVN: r278806 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b1c424000b1..d4a66fd0e58 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2019-11-28 Richard Biener + + PR tree-optimization/92645 + * tree-ssa-forwprop.c (get_bit_field_ref_def): Also handle + conversions inside a mode class. Remove restriction on + preserving the element size. + (simplify_vector_constructor): Deal with the above and for + identity permutes also try using VEC_UNPACK_[FLOAT_]LO_EXPR + and VEC_PACK_TRUNC_EXPR. + 2019-11-28 Georg-Johann Lay Must use push insn to pass varargs arguments of DFmode because diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 969c8bda369..a0fdcd5d9de 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2019-11-28 Richard Biener + + PR tree-optimization/92645 + * gcc.target/i386/pr92645-4.c: New testcase. + 2019-11-28 Christophe Lyon * gcc.target/arm/asm-flag-4.c: Use -mfloat-abi=softfp. 
diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c b/gcc/testsuite/gcc.target/i386/pr92645-4.c new file mode 100644 index 00000000000..788a97ed117 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92645-4.c @@ -0,0 +1,56 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -fdump-tree-optimized -Wno-psabi" } */ + +typedef unsigned int u32v4 __attribute__((vector_size(16))); +typedef unsigned short u16v16 __attribute__((vector_size(32))); +typedef unsigned char u8v16 __attribute__((vector_size(16))); + +union vec128 { + u8v16 u8; + u32v4 u32; +}; + +#define memcpy __builtin_memcpy + +static u16v16 zxt(u8v16 x) +{ + return (u16v16) { + x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], + x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] + }; +} + +static u8v16 narrow(u16v16 x) +{ + return (u8v16) { + x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], + x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] + }; +} + +void f(char *dst, char *src, unsigned long n, unsigned c) +{ + unsigned ia = 255 - (c >> 24); + ia += ia >> 7; + + union vec128 c4 = {0}, ia16 = {0}; + c4.u32 += c; + ia16.u8 += (unsigned char)ia; + + u16v16 c16 = (zxt(c4.u8) << 8) + 128; + + for (; n; src += 16, dst += 16, n -= 4) { + union vec128 s; + memcpy(&s, src, sizeof s); + s.u8 = narrow((zxt(s.u8)*zxt(ia16.u8) + c16) >> 8); + memcpy(dst, &s, sizeof s); + } +} + +/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 3 "optimized" } } */ +/* We're missing an opportunity to, after later optimizations, combine + a uniform CTOR with a vec_unpack_lo_expr to a CTOR on a converted + element. 
*/ +/* { dg-final { scan-tree-dump-times "vec_unpack_lo" 2 "optimized" { xfail *-*-* } } } */ +/* { dg-final { scan-tree-dump-times "VEC_PACK_TRUNC" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "optimized" } } */ diff --git a/gcc/tree-ssa-forwprop.c b/gcc/tree-ssa-forwprop.c index f95b05b2832..b275a637347 100644 --- a/gcc/tree-ssa-forwprop.c +++ b/gcc/tree-ssa-forwprop.c @@ -2004,16 +2004,12 @@ get_bit_field_ref_def (tree val, enum tree_code &conv_code) return NULL_TREE; enum tree_code code = gimple_assign_rhs_code (def_stmt); if (code == FLOAT_EXPR - || code == FIX_TRUNC_EXPR) + || code == FIX_TRUNC_EXPR + || CONVERT_EXPR_CODE_P (code)) { tree op1 = gimple_assign_rhs1 (def_stmt); if (conv_code == ERROR_MARK) - { - if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (val))), - GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (op1))))) - return NULL_TREE; - conv_code = code; - } + conv_code = code; else if (conv_code != code) return NULL_TREE; if (TREE_CODE (op1) != SSA_NAME) @@ -2078,9 +2074,8 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) && VECTOR_TYPE_P (TREE_TYPE (ref)) && useless_type_conversion_p (TREE_TYPE (op1), TREE_TYPE (TREE_TYPE (ref))) - && known_eq (bit_field_size (op1), elem_size) && constant_multiple_p (bit_field_offset (op1), - elem_size, &elem) + bit_field_size (op1), &elem) && TYPE_VECTOR_SUBPARTS (TREE_TYPE (ref)).is_constant (&refnelts)) { unsigned int j; @@ -2153,7 +2148,83 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) if (conv_code != ERROR_MARK && !supportable_convert_operation (conv_code, type, conv_src_type, &conv_code)) - return false; + { + /* Only few targets implement direct conversion patterns so try + some simple special cases via VEC_[UN]PACK[_FLOAT]_LO_EXPR. 
*/ + optab optab; + tree halfvectype, dblvectype; + if (CONVERT_EXPR_CODE_P (conv_code) + && (2 * TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0]))) + == TYPE_PRECISION (TREE_TYPE (type))) + && mode_for_vector (as_a <scalar_mode> + (TYPE_MODE (TREE_TYPE (TREE_TYPE (orig[0])))), + nelts * 2).exists () + && (dblvectype + = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])), + nelts * 2)) + && (optab = optab_for_tree_code (FLOAT_TYPE_P (TREE_TYPE (type)) + ? VEC_UNPACK_FLOAT_LO_EXPR + : VEC_UNPACK_LO_EXPR, + dblvectype, + optab_default)) + && (optab_handler (optab, TYPE_MODE (dblvectype)) + != CODE_FOR_nothing)) + { + gimple_seq stmts = NULL; + tree dbl; + if (refnelts == nelts) + { + /* ??? Paradoxical subregs don't exist, so insert into + the lower half of a wider zero vector. */ + dbl = gimple_build (&stmts, BIT_INSERT_EXPR, dblvectype, + build_zero_cst (dblvectype), orig[0], + bitsize_zero_node); + } + else if (refnelts == 2 * nelts) + dbl = orig[0]; + else + dbl = gimple_build (&stmts, BIT_FIELD_REF, dblvectype, + orig[0], TYPE_SIZE (dblvectype), + bitsize_zero_node); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple_assign_set_rhs_with_ops (gsi, + FLOAT_TYPE_P (TREE_TYPE (type)) + ? 
VEC_UNPACK_FLOAT_LO_EXPR + : VEC_UNPACK_LO_EXPR, + dbl); + } + else if (CONVERT_EXPR_CODE_P (conv_code) + && (TYPE_PRECISION (TREE_TYPE (TREE_TYPE (orig[0]))) + == 2 * TYPE_PRECISION (TREE_TYPE (type))) + && mode_for_vector (as_a <scalar_mode> + (TYPE_MODE + (TREE_TYPE (TREE_TYPE (orig[0])))), + nelts / 2).exists () + && (halfvectype + = build_vector_type (TREE_TYPE (TREE_TYPE (orig[0])), + nelts / 2)) + && (optab = optab_for_tree_code (VEC_PACK_TRUNC_EXPR, + halfvectype, + optab_default)) + && (optab_handler (optab, TYPE_MODE (halfvectype)) + != CODE_FOR_nothing)) + { + gimple_seq stmts = NULL; + tree low = gimple_build (&stmts, BIT_FIELD_REF, halfvectype, + orig[0], TYPE_SIZE (halfvectype), + bitsize_zero_node); + tree hig = gimple_build (&stmts, BIT_FIELD_REF, halfvectype, + orig[0], TYPE_SIZE (halfvectype), + TYPE_SIZE (halfvectype)); + gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); + gimple_assign_set_rhs_with_ops (gsi, VEC_PACK_TRUNC_EXPR, + low, hig); + } + else + return false; + update_stmt (gsi_stmt (*gsi)); + return true; + } if (nelts != refnelts) { gassign *lowpart @@ -2178,9 +2249,8 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) ? perm_type : build_vector_type (TREE_TYPE (perm_type), nelts)); if (conv_code != ERROR_MARK - && (!supportable_convert_operation (conv_code, type, conv_src_type, - &conv_code) - || conv_code == CALL_EXPR)) + && !supportable_convert_operation (conv_code, type, conv_src_type, + &conv_code)) return false; /* Now that we know the number of elements of the source build the