From b7b3378f91c0641f2ef4d88db22af62a571c9359 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 5 Feb 2020 15:38:49 +0100 Subject: [PATCH] i386: Omit clobbers from vzeroupper until final [PR92190] As mentioned in the PR, the CLOBBERs in vzeroupper are added there even for registers that aren't ever live in the function before and break the prologue/epilogue expansion with ms ABI (normal ABIs are fine, as they consider all [xyz]mm registers call clobbered, but the ms ABI considers xmm0-15 call used but the bits above low 128 ones call clobbered). The following patch fixes it by not adding the clobbers during vzeroupper pass (before pro_and_epilogue), but adding them for -fipa-ra purposes only during the final output. Perhaps we could add some CLOBBERs early (say for df_regs_ever_live_p regs that aren't live in the live_regs bitmap, or depending on the ABI either add all of them immediately, or for ms ABI add CLOBBERs for xmm0-xmm5 if they don't have a SET) and add the rest later. And the addition could be perhaps done at other spots, e.g. in an epilogue_completed guarded splitter. 2020-02-05 Jakub Jelinek PR target/92190 * config/i386/i386-features.c (ix86_add_reg_usage_to_vzeroupper): Only include sets and not clobbers in the vzeroupper pattern. * config/i386/sse.md (*avx_vzeroupper): Require in insn condition that the parallel has 17 (64-bit) or 9 (32-bit) elts. (*avx_vzeroupper_1): New define_insn_and_split. * gcc.target/i386/pr92190.c: New test. --- gcc/ChangeLog | 7 +++++ gcc/config/i386/i386-features.c | 29 ++++++++++-------- gcc/config/i386/sse.md | 40 ++++++++++++++++++++++++- gcc/testsuite/ChangeLog | 5 ++++ gcc/testsuite/gcc.target/i386/pr92190.c | 19 ++++++++++++ 5 files changed, 86 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr92190.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index cb4132f955e..a5ae52acbb6 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,12 @@ 2020-02-05 Jakub Jelinek + PR target/92190 + * config/i386/i386-features.c (ix86_add_reg_usage_to_vzeroupper): Only + include sets and not clobbers in the vzeroupper pattern. + * config/i386/sse.md (*avx_vzeroupper): Require in insn condition that + the parallel has 17 (64-bit) or 9 (32-bit) elts. + (*avx_vzeroupper_1): New define_insn_and_split. + PR target/92190 * recog.c (pass_split_after_reload::gate): For STACK_REGS targets, don't run when !optimize. diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c index b49e6f8d408..6919c839605 100644 --- a/gcc/config/i386/i386-features.c +++ b/gcc/config/i386/i386-features.c @@ -1764,29 +1764,32 @@ convert_scalars_to_vector (bool timode_p) (set (reg:V2DF R) (reg:V2DF R)) - which preserves the low 128 bits but clobbers the upper bits. - For a dead register we just use: - - (clobber (reg:V2DF R)) - - which invalidates any previous contents of R and stops R from becoming - live across the vzeroupper in future. */ + which preserves the low 128 bits but clobbers the upper bits. */ static void ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs) { rtx pattern = PATTERN (insn); unsigned int nregs = TARGET_64BIT ? 16 : 8; - rtvec vec = rtvec_alloc (nregs + 1); - RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0); + unsigned int npats = nregs; for (unsigned int i = 0; i < nregs; ++i) { unsigned int regno = GET_SSE_REGNO (i); + if (!bitmap_bit_p (live_regs, regno)) + npats--; + } + if (npats == 0) + return; + rtvec vec = rtvec_alloc (npats + 1); + RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0); + for (unsigned int i = 0, j = 0; i < nregs; ++i) + { + unsigned int regno = GET_SSE_REGNO (i); + if (!bitmap_bit_p (live_regs, regno)) + continue; rtx reg = gen_rtx_REG (V2DImode, regno); - if (bitmap_bit_p (live_regs, regno)) - RTVEC_ELT (vec, i + 1) = gen_rtx_SET (reg, reg); - else - RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg); + ++j; + RTVEC_ELT (vec, j) = gen_rtx_SET (reg, reg); } XVEC (pattern, 0) = vec; df_insn_rescan (insn); diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 46f00e3d007..ac4cf5be686 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -19818,7 +19818,7 @@ (define_insn "*avx_vzeroupper" [(match_parallel 0 "vzeroupper_pattern" [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] - "TARGET_AVX" + "TARGET_AVX && XVECLEN (operands[0], 0) == (TARGET_64BIT ? 16 : 8) + 1" "vzeroupper" [(set_attr "type" "sse") (set_attr "modrm" "0") @@ -19827,6 +19827,44 @@ (set_attr "btver2_decode" "vector") (set_attr "mode" "OI")]) +(define_insn_and_split "*avx_vzeroupper_1" + [(match_parallel 0 "vzeroupper_pattern" + [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] + "TARGET_AVX && XVECLEN (operands[0], 0) != (TARGET_64BIT ? 16 : 8) + 1" + "#" + "&& epilogue_completed" + [(match_dup 0)] +{ + /* For IPA-RA purposes, make it clear the instruction clobbers + even XMM registers not mentioned explicitly in the pattern. */ + unsigned int nregs = TARGET_64BIT ? 16 : 8; + unsigned int npats = XVECLEN (operands[0], 0); + rtvec vec = rtvec_alloc (nregs + 1); + RTVEC_ELT (vec, 0) = XVECEXP (operands[0], 0, 0); + for (unsigned int i = 0, j = 1; i < nregs; ++i) + { + unsigned int regno = GET_SSE_REGNO (i); + if (j < npats + && REGNO (SET_DEST (XVECEXP (operands[0], 0, j))) == regno) + { + RTVEC_ELT (vec, i + 1) = XVECEXP (operands[0], 0, j); + j++; + } + else + { + rtx reg = gen_rtx_REG (V2DImode, regno); + RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg); + } + } + operands[0] = gen_rtx_PARALLEL (VOIDmode, vec); +} + [(set_attr "type" "sse") + (set_attr "modrm" "0") + (set_attr "memory" "none") + (set_attr "prefix" "vex") + (set_attr "btver2_decode" "vector") + (set_attr "mode" "OI")]) + (define_mode_attr pbroadcast_evex_isa [(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw") (V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 8ee124ba782..ff47b94fc38 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2020-02-05 Jakub Jelinek + + PR target/92190 + * gcc.target/i386/pr92190.c: New test. + 2020-02-05 Richard Biener PR testsuite/92177 diff --git a/gcc/testsuite/gcc.target/i386/pr92190.c b/gcc/testsuite/gcc.target/i386/pr92190.c new file mode 100644 index 00000000000..c13c515e35c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92190.c @@ -0,0 +1,19 @@ +/* PR target/92190 */ +/* { dg-do compile { target { *-*-linux* && lp64 } } } */ +/* { dg-options "-mabi=ms -O2 -mavx512f" } */ + +typedef char VC __attribute__((vector_size (16))); +typedef int VI __attribute__((vector_size (16 * sizeof 0))); +VC a; +VI b; +void bar (VI); +void baz (VC); + +void +foo (void) +{ + VC k = a; + VI n = b; + bar (n); + baz (k); +} -- 2.30.2