From 2953b72fdd6c7d812028a636dfadf1c0e89ca314 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Thu, 20 Jul 2017 18:36:18 +0200 Subject: [PATCH] re PR target/80846 (auto-vectorized AVX2 horizontal sum should narrow to 128b right away, to be more efficient for Ryzen and Intel) PR target/80846 * config/i386/i386.c (ix86_expand_vector_init_general): Handle V2TImode and V4TImode. (ix86_expand_vector_extract): Likewise. * config/i386/sse.md (VMOVE): Enable V4TImode even for just TARGET_AVX512F, instead of only for TARGET_AVX512BW. (ssescalarmode): Handle V4TImode and V2TImode. (VEC_EXTRACT_MODE): Add V4TImode and V2TImode. (*vec_extractv2ti, *vec_extractv4ti): New insns. (VEXTRACTI128_MODE): New mode iterator. (splitter for *vec_extractv?ti first element): New. (VEC_INIT_MODE): New mode iterator. (vec_init<mode>): Consolidate 3 expanders into one using VEC_INIT_MODE mode iterator. * gcc.target/i386/avx-pr80846.c: New test. * gcc.target/i386/avx2-pr80846.c: New test. * gcc.target/i386/avx512f-pr80846.c: New test. From-SVN: r250397 --- gcc/ChangeLog | 17 ++++ gcc/config/i386/i386.c | 22 +++++ gcc/config/i386/sse.md | 82 ++++++++++++++----- gcc/testsuite/ChangeLog | 7 ++ gcc/testsuite/gcc.target/i386/avx-pr80846.c | 39 +++++++++ gcc/testsuite/gcc.target/i386/avx2-pr80846.c | 5 ++ .../gcc.target/i386/avx512f-pr80846.c | 5 ++ 7 files changed, 155 insertions(+), 22 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/avx-pr80846.c create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr80846.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-pr80846.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index cb7e697d69b..9bc43b4dc88 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,20 @@ +2017-07-20 Jakub Jelinek + + PR target/80846 + * config/i386/i386.c (ix86_expand_vector_init_general): Handle + V2TImode and V4TImode. + (ix86_expand_vector_extract): Likewise. 
+ * config/i386/sse.md (VMOVE): Enable V4TImode even for just + TARGET_AVX512F, instead of only for TARGET_AVX512BW. + (ssescalarmode): Handle V4TImode and V2TImode. + (VEC_EXTRACT_MODE): Add V4TImode and V2TImode. + (*vec_extractv2ti, *vec_extractv4ti): New insns. + (VEXTRACTI128_MODE): New mode iterator. + (splitter for *vec_extractv?ti first element): New. + (VEC_INIT_MODE): New mode iterator. + (vec_init<mode>): Consolidate 3 expanders into one using + VEC_INIT_MODE mode iterator. + 2017-07-20 Alexander Monakov * lra-assigns.c (pseudo_compare_func): Fix comparison step based on diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index eac76815287..ca29135d39f 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -44118,6 +44118,26 @@ ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode, ix86_expand_vector_init_concat (mode, target, ops, n); return; + case V2TImode: + for (i = 0; i < 2; i++) + ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); + op0 = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, op0, ops, 2); + emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); + return; + + case V4TImode: + for (i = 0; i < 4; i++) + ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i)); + ops[4] = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2); + ops[5] = gen_reg_rtx (V4DImode); + ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2); + op0 = gen_reg_rtx (V8DImode); + ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2); + emit_move_insn (target, gen_lowpart (GET_MODE (target), op0)); + return; + case V32QImode: half_mode = V16QImode; goto half; @@ -44659,6 +44679,8 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) case V2DFmode: case V2DImode: + case V2TImode: + case V4TImode: use_vec_extr = true; break; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index e2db3b17f05..56b7f436d5d 100644 --- a/gcc/config/i386/sse.md +++ 
b/gcc/config/i386/sse.md @@ -175,7 +175,7 @@ (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI - (V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX") V1TI + (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX") V1TI (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) @@ -687,7 +687,8 @@ (V16SI "SI") (V8SI "SI") (V4SI "SI") (V8DI "DI") (V4DI "DI") (V2DI "DI") (V16SF "SF") (V8SF "SF") (V4SF "SF") - (V8DF "DF") (V4DF "DF") (V2DF "DF")]) + (V8DF "DF") (V4DF "DF") (V2DF "DF") + (V4TI "TI") (V2TI "TI")]) ;; Mapping of vector modes to the 128bit modes (define_mode_attr ssexmmmode @@ -6920,15 +6921,6 @@ (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex") (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")]) -(define_expand "vec_init" - [(match_operand:V_128 0 "register_operand") - (match_operand 1)] - "TARGET_SSE" -{ - ix86_expand_vector_init (false, operands[0], operands[1]); - DONE; -}) - ;; Avoid combining registers from different units in a single alternative, ;; see comment above inline_secondary_memory_needed function in i386.c (define_insn "vec_set_0" @@ -7886,7 +7878,8 @@ (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF - (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF]) + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") V2DF + (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")]) (define_expand "vec_extract" [(match_operand: 0 "register_operand") @@ -13734,6 +13727,50 @@ operands[1] = adjust_address (operands[1], mode, offs); }) +(define_insn "*vec_extractv2ti" + [(set (match_operand:TI 0 "nonimmediate_operand" "=xm,vm") + (vec_select:TI + (match_operand:V2TI 1 "register_operand" "x,v") + (parallel + [(match_operand:SI 2 "const_0_to_1_operand")])))] + "TARGET_AVX" + "@ + vextract%~128\t{%2, %1, %0|%0, %1, %2} + vextracti32x4\t{%2, %g1, %0|%0, %g1, %2}" + 
[(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "vex,evex") + (set_attr "mode" "OI")]) + +(define_insn "*vec_extractv4ti" + [(set (match_operand:TI 0 "nonimmediate_operand" "=vm") + (vec_select:TI + (match_operand:V4TI 1 "register_operand" "v") + (parallel + [(match_operand:SI 2 "const_0_to_3_operand")])))] + "TARGET_AVX512F" + "vextracti32x4\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix_extra" "1") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") + (set_attr "mode" "XI")]) + +(define_mode_iterator VEXTRACTI128_MODE + [(V4TI "TARGET_AVX512F") V2TI]) + +(define_split + [(set (match_operand:TI 0 "nonimmediate_operand") + (vec_select:TI + (match_operand:VEXTRACTI128_MODE 1 "register_operand") + (parallel [(const_int 0)])))] + "TARGET_AVX + && reload_completed + && (TARGET_AVX512VL || !EXT_REX_SSE_REG_P (operands[1]))" + [(set (match_dup 0) (match_dup 1))] + "operands[1] = gen_lowpart (TImode, operands[1]);") + ;; Turn SImode or DImode extraction from arbitrary SSE/AVX/AVX512F ;; vector modes into vec_extract*. (define_split @@ -18738,19 +18775,20 @@ mode); }) -(define_expand "vec_init" - [(match_operand:V_256 0 "register_operand") - (match_operand 1)] - "TARGET_AVX" -{ - ix86_expand_vector_init (false, operands[0], operands[1]); - DONE; -}) +;; Modes handled by vec_init patterns. 
+(define_mode_iterator VEC_INIT_MODE + [(V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI + (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI + (V16SI "TARGET_AVX512F") (V8SI "TARGET_AVX") V4SI + (V8DI "TARGET_AVX512F") (V4DI "TARGET_AVX") V2DI + (V16SF "TARGET_AVX512F") (V8SF "TARGET_AVX") V4SF + (V8DF "TARGET_AVX512F") (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2") + (V4TI "TARGET_AVX512F") (V2TI "TARGET_AVX")]) (define_expand "vec_init" - [(match_operand:VF48_I1248 0 "register_operand") + [(match_operand:VEC_INIT_MODE 0 "register_operand") (match_operand 1)] - "TARGET_AVX512F" + "TARGET_SSE" { ix86_expand_vector_init (false, operands[0], operands[1]); DONE; diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 3fc1332e686..bafcb2c6382 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2017-07-20 Jakub Jelinek + + PR target/80846 + * gcc.target/i386/avx-pr80846.c: New test. + * gcc.target/i386/avx2-pr80846.c: New test. + * gcc.target/i386/avx512f-pr80846.c: New test. 
+ 2017-07-20 Bin Cheng PR tree-optimization/81388 diff --git a/gcc/testsuite/gcc.target/i386/avx-pr80846.c b/gcc/testsuite/gcc.target/i386/avx-pr80846.c new file mode 100644 index 00000000000..338f01039f2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx-pr80846.c @@ -0,0 +1,39 @@ +/* PR target/80846 */ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -mavx -mno-avx2" } */ + +typedef __int128 V __attribute__((vector_size (32))); +typedef long long W __attribute__((vector_size (32))); +typedef int X __attribute__((vector_size (16))); +typedef __int128 Y __attribute__((vector_size (64))); +typedef long long Z __attribute__((vector_size (64))); + +W f1 (__int128 x, __int128 y) { return (W) ((V) { x, y }); } +__int128 f2 (W x) { return ((V)x)[0]; } +__int128 f3 (W x) { return ((V)x)[1]; } +W f4 (X x, X y) { union { X x; __int128 i; } u = { .x = x }, v = { .x = y }; return (W) ((V) { u.i, v.i }); } +X f5 (W x) { return (X)(((V)x)[0]); } +X f6 (W x) { return (X)(((V)x)[1]); } +W f7 (void) { return (W) ((V) { 2, 3 }); } +W f8 (X x) { union { X x; __int128 i; } u = { .x = x }; return (W) ((V) { u.i, 3 }); } +W f9 (X x) { union { X x; __int128 i; } u = { .x = x }; return (W) ((V) { 2, u.i }); } +W f10 (X x) { union { X x; __int128 i; } u = { .x = x }; return (W) ((V) { u.i, u.i }); } +#ifdef __AVX512F__ +Z f11 (__int128 x, __int128 y, __int128 z, __int128 a) { return (Z) ((Y) { x, y, z, a }); } +__int128 f12 (Z x) { return ((Y)x)[0]; } +__int128 f13 (Z x) { return ((Y)x)[1]; } +__int128 f14 (Z x) { return ((Y)x)[2]; } +__int128 f15 (Z x) { return ((Y)x)[3]; } +Z f16 (X x, X y, X z, X a) { union { X x; __int128 i; } u = { .x = x }, v = { .x = y }, w = { .x = z }, t = { .x = a }; + return (Z) ((Y) { u.i, v.i, w.i, t.i }); } +X f17 (Z x) { return (X)(((Y)x)[0]); } +X f18 (Z x) { return (X)(((Y)x)[1]); } +X f19 (Z x) { return (X)(((Y)x)[2]); } +X f20 (Z x) { return (X)(((Y)x)[3]); } +Z f21 (void) { return (Z) ((Y) { 2, 3, 4, 5 }); } +Z f22 (X x) { union { X 
x; __int128 i; } u = { .x = x }; return (Z) ((Y) { u.i, 3, 4, 5 }); } +Z f23 (X x) { union { X x; __int128 i; } u = { .x = x }; return (Z) ((Y) { 2, u.i, 4, 5 }); } +Z f24 (X x) { union { X x; __int128 i; } u = { .x = x }; return (Z) ((Y) { 2, 3, u.i, 5 }); } +Z f25 (X x) { union { X x; __int128 i; } u = { .x = x }; return (Z) ((Y) { 2, 3, 4, u.i }); } +Z f26 (X x) { union { X x; __int128 i; } u = { .x = x }; return (Z) ((Y) { u.i, u.i, u.i, u.i }); } +#endif diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr80846.c b/gcc/testsuite/gcc.target/i386/avx2-pr80846.c new file mode 100644 index 00000000000..907fd4f7b62 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-pr80846.c @@ -0,0 +1,5 @@ +/* PR target/80846 */ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -mavx2 -mno-avx512f" } */ + +#include "avx-pr80846.c" diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr80846.c b/gcc/testsuite/gcc.target/i386/avx512f-pr80846.c new file mode 100644 index 00000000000..c32c9762cf9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-pr80846.c @@ -0,0 +1,5 @@ +/* PR target/80846 */ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -mavx512f" } */ + +#include "avx-pr80846.c" -- 2.30.2