From fdab73dc76d4551c652a3f3d686e765e637c95d9 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 4 May 2016 23:13:13 +0200 Subject: [PATCH] re PR target/70873 ([7 Regression] 20% performance regression at 482.sphinx3 after r235442 with -O2 -m32 on Haswell.) PR target/70873 * config/i386/i386.md (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_extend sf->df peephole2): Change to post-epilogue_completed late splitter. Use sse_reg_operand as operand 0 predicate. (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_truncate df->sf peephole2): Ditto. (TARGET_SSE_PARTIAL_REG_DEPENDENCY float {si,di}->{sf,df} peephole2): Ditto. Emit the pattern using RTX. (TARGET_USE_VECTOR_FP_CONVERTS float_extend sf->df splitter): Use sse_reg_operand as operand 0 predicate. Do not use true_regnum in the post-reload splitter. Use lowpart_subreg instead of gen_rtx_REG. (TARGET_USE_VECTOR_FP_CONVERTS float_truncate df->sf splitter): Ditto. (TARGET_USE_VECTOR_CONVERTS float si->{sf,df} splitter): Use sse_reg_operand as operand 0 predicate. (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_extend sf->df peephole2): Use sse_reg_operand as operand 0 predicate. Use lowpart_subreg instead of gen_rtx_REG. (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_truncate df->sf peephole2): Ditto. From-SVN: r235906 --- gcc/ChangeLog | 32 +++++++++++-- gcc/config/i386/i386.md | 101 +++++++++++++++++++--------------------- 2 files changed, 77 insertions(+), 56 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 45e90874387..772dd37c4ff 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,29 @@ +2016-05-04 Uros Bizjak + + PR target/70873 + * config/i386/i386.md + (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_extend sf->df peephole2): + Change to post-epilogue_completed late splitter. Use sse_reg_operand + as operand 0 predicate. + (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_truncate df->sf peephole2): + Ditto. + (TARGET_SSE_PARTIAL_REG_DEPENDENCY float {si,di}->{sf,df} peephole2): + Ditto. Emit the pattern using RTX. 
+ + (TARGET_USE_VECTOR_FP_CONVERTS float_extend sf->df splitter): + Use sse_reg_operand as operand 0 predicate. Do not use true_regnum in + the post-reload splitter. Use lowpart_subreg instead of gen_rtx_REG. + (TARGET_USE_VECTOR_FP_CONVERTS float_truncate df->sf splitter): + Ditto. + (TARGET_USE_VECTOR_CONVERTS float si->{sf,df} splitter): Use + sse_reg_operand as operand 0 predicate. + + (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_extend sf->df peephole2): + Use sse_reg_operand as operand 0 predicate. Use lowpart_subreg + instead of gen_rtx_REG. + (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_truncate df->sf peephole2): + Ditto. + 2016-05-04 Segher Boessenkool * function.c (emit_use_return_register_into_block): Delete. @@ -94,8 +120,7 @@ * match.pd: Add BIT_FIELD_REF canonicalizations and vector constructor simplifications. - * fold-const.c (fold_ternary_loc): Remove duplicate functionality - here. + * fold-const.c (fold_ternary_loc): Remove duplicate functionality here. 2016-05-04 Oleg Endo @@ -219,8 +244,7 @@ 2016-05-03 Jakub Jelinek * config/i386/i386.md (*truncdfsf_mixed, *truncdfsf_i387, - *truncxfsf2_mixed, *truncxfdf2_mixed): Use v constraint instead - of x. + *truncxfsf2_mixed, *truncxfdf2_mixed): Use v constraint instead of x. 2016-05-03 Richard Biener diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index ba1ff8be5da..dd56b0523e8 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4231,12 +4231,12 @@ that might lead to ICE on 32bit target. The sequence unlikely combine anyway. 
*/ (define_split - [(set (match_operand:DF 0 "register_operand") + [(set (match_operand:DF 0 "sse_reg_operand") (float_extend:DF (match_operand:SF 1 "nonimmediate_operand")))] "TARGET_USE_VECTOR_FP_CONVERTS && optimize_insn_for_speed_p () - && reload_completed && SSE_REG_P (operands[0]) + && reload_completed && (!EXT_REX_SSE_REG_P (operands[0]) || TARGET_AVX512VL)" [(set (match_dup 2) @@ -4253,13 +4253,11 @@ { /* If it is unsafe to overwrite upper half of source, we need to move to destination and unpack there. */ - if (((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER - || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4) - && true_regnum (operands[0]) != true_regnum (operands[1])) + if (REGNO (operands[0]) != REGNO (operands[1]) || (EXT_REX_SSE_REG_P (operands[1]) && !TARGET_AVX512VL)) { - rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0])); + rtx tmp = lowpart_subreg (SFmode, operands[0], DFmode); emit_move_insn (tmp, operands[1]); } else @@ -4267,7 +4265,7 @@ /* FIXME: vec_interleave_lowv4sf for AVX512VL should allow =v, v, then vbroadcastss will be only needed for AVX512F without AVX512VL. */ - if (!EXT_REX_SSE_REGNO_P (true_regnum (operands[3]))) + if (!EXT_REX_SSE_REGNO_P (REGNO (operands[3]))) emit_insn (gen_vec_interleave_lowv4sf (operands[3], operands[3], operands[3])); else @@ -4283,15 +4281,14 @@ ;; It's more profitable to split and then extend in the same register. 
(define_peephole2 - [(set (match_operand:DF 0 "register_operand") + [(set (match_operand:DF 0 "sse_reg_operand") (float_extend:DF (match_operand:SF 1 "memory_operand")))] "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS - && optimize_insn_for_speed_p () - && SSE_REG_P (operands[0])" + && optimize_insn_for_speed_p ()" [(set (match_dup 2) (match_dup 1)) (set (match_dup 0) (float_extend:DF (match_dup 2)))] - "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));") + "operands[2] = lowpart_subreg (SFmode, operands[0], DFmode);") (define_insn "*extendsfdf2" [(set (match_operand:DF 0 "nonimm_ssenomem_operand" "=f,m,v") @@ -4390,12 +4387,12 @@ that might lead to ICE on 32bit target. The sequence unlikely combine anyway. */ (define_split - [(set (match_operand:SF 0 "register_operand") + [(set (match_operand:SF 0 "sse_reg_operand") (float_truncate:SF (match_operand:DF 1 "nonimmediate_operand")))] "TARGET_USE_VECTOR_FP_CONVERTS && optimize_insn_for_speed_p () - && reload_completed && SSE_REG_P (operands[0]) + && reload_completed && (!EXT_REX_SSE_REG_P (operands[0]) || TARGET_AVX512VL)" [(set (match_dup 2) @@ -4413,9 +4410,7 @@ if (REG_P (operands[1])) { if (!TARGET_SSE3 - && true_regnum (operands[0]) != true_regnum (operands[1]) - && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER - || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8)) + && REGNO (operands[0]) != REGNO (operands[1])) { rtx tmp = lowpart_subreg (DFmode, operands[0], SFmode); emit_move_insn (tmp, operands[1]); @@ -4432,15 +4427,14 @@ ;; It's more profitable to split and then extend in the same register. 
(define_peephole2 - [(set (match_operand:SF 0 "register_operand") + [(set (match_operand:SF 0 "sse_reg_operand") (float_truncate:SF (match_operand:DF 1 "memory_operand")))] "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS - && optimize_insn_for_speed_p () - && SSE_REG_P (operands[0])" + && optimize_insn_for_speed_p ()" [(set (match_dup 2) (match_dup 1)) (set (match_dup 0) (float_truncate:SF (match_dup 2)))] - "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));") + "operands[2] = lowpart_subreg (DFmode, operands[0], SFmode);") (define_expand "truncdfsf2_with_temp" [(parallel [(set (match_operand:SF 0) @@ -4547,7 +4541,7 @@ "reload_completed" [(set (match_dup 2) (match_dup 1)) (set (match_dup 0) (match_dup 2))] - "operands[1] = gen_rtx_REG (SFmode, true_regnum (operands[1]));") + "operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));") ;; Conversion from XFmode to {SF,DF}mode @@ -5153,11 +5147,11 @@ ;; slots when !TARGET_INTER_UNIT_MOVES_TO_VEC disables the general_regs ;; alternative in sse2_loadld. (define_split - [(set (match_operand:MODEF 0 "register_operand") + [(set (match_operand:MODEF 0 "sse_reg_operand") (float:MODEF (match_operand:SI 1 "nonimmediate_operand")))] - "TARGET_SSE2 && TARGET_SSE_MATH - && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun) - && reload_completed && SSE_REG_P (operands[0]) + "TARGET_USE_VECTOR_CONVERTS + && optimize_function_for_speed_p (cfun) + && reload_completed && (MEM_P (operands[1]) || TARGET_INTER_UNIT_MOVES_TO_VEC) && (!EXT_REX_SSE_REG_P (operands[0]) || TARGET_AVX512VL)" @@ -5176,41 +5170,43 @@ DONE; }) -;; Avoid partial SSE register dependency stalls +;; Avoid partial SSE register dependency stalls. 
This splitter should split +;; late in the pass sequence (after register rename pass), so allocated +;; registers won't change anymore + (define_split - [(set (match_operand:MODEF 0 "register_operand") + [(set (match_operand:MODEF 0 "sse_reg_operand") (float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))] - "TARGET_SSE2 && TARGET_SSE_MATH - && TARGET_SSE_PARTIAL_REG_DEPENDENCY + "TARGET_SSE_PARTIAL_REG_DEPENDENCY && optimize_function_for_speed_p (cfun) - && reload_completed && SSE_REG_P (operands[0]) + && epilogue_completed && (!EXT_REX_SSE_REG_P (operands[0]) || TARGET_AVX512VL)" - [(const_int 0)] + [(set (match_dup 0) + (vec_merge: + (vec_duplicate: + (float:MODEF + (match_dup 1))) + (match_dup 0) + (const_int 1)))] { const machine_mode vmode = mode; - const machine_mode mode = mode; - rtx t, op0 = lowpart_subreg (vmode, operands[0], mode); - - emit_move_insn (op0, CONST0_RTX (vmode)); - t = gen_rtx_FLOAT (mode, operands[1]); - t = gen_rtx_VEC_DUPLICATE (vmode, t); - t = gen_rtx_VEC_MERGE (vmode, t, op0, const1_rtx); - emit_insn (gen_rtx_SET (op0, t)); - DONE; + operands[0] = lowpart_subreg (vmode, operands[0], mode); + emit_move_insn (operands[0], CONST0_RTX (vmode)); }) -;; Break partial reg stall for cvtsd2ss. +;; Break partial reg stall for cvtsd2ss. This splitter should split +;; late in the pass sequence (after register rename pass), +;; so allocated registers won't change anymore. 
-(define_peephole2 - [(set (match_operand:SF 0 "register_operand") +(define_split + [(set (match_operand:SF 0 "sse_reg_operand") (float_truncate:SF (match_operand:DF 1 "nonimmediate_operand")))] - "TARGET_SSE2 && TARGET_SSE_MATH - && TARGET_SSE_PARTIAL_REG_DEPENDENCY + "TARGET_SSE_PARTIAL_REG_DEPENDENCY && optimize_function_for_speed_p (cfun) - && SSE_REG_P (operands[0]) + && epilogue_completed && (!SSE_REG_P (operands[1]) || REGNO (operands[0]) != REGNO (operands[1])) && (!EXT_REX_SSE_REG_P (operands[0]) @@ -5228,16 +5224,17 @@ emit_move_insn (operands[0], CONST0_RTX (V4SFmode)); }) -;; Break partial reg stall for cvtss2sd. +;; Break partial reg stall for cvtss2sd. This splitter should split +;; late in the pass sequence (after register rename pass), +;; so allocated registers won't change anymore. -(define_peephole2 - [(set (match_operand:DF 0 "register_operand") +(define_split + [(set (match_operand:DF 0 "sse_reg_operand") (float_extend:DF (match_operand:SF 1 "nonimmediate_operand")))] - "TARGET_SSE2 && TARGET_SSE_MATH - && TARGET_SSE_PARTIAL_REG_DEPENDENCY + "TARGET_SSE_PARTIAL_REG_DEPENDENCY && optimize_function_for_speed_p (cfun) - && SSE_REG_P (operands[0]) + && epilogue_completed && (!SSE_REG_P (operands[1]) || REGNO (operands[0]) != REGNO (operands[1])) && (!EXT_REX_SSE_REG_P (operands[0]) -- 2.30.2