re PR target/70873 ([7 Regression] 20% performance regression at 482.sphinx3 after...
authorUros Bizjak <uros@gcc.gnu.org>
Wed, 4 May 2016 21:13:13 +0000 (23:13 +0200)
committerUros Bizjak <uros@gcc.gnu.org>
Wed, 4 May 2016 21:13:13 +0000 (23:13 +0200)
PR target/70873
* config/i386/i386.md
(TARGET_SSE_PARTIAL_REG_DEPENDENCY float_extend sf->df peephole2):
Change to post-epilogue_completed late splitter.  Use sse_reg_operand
as operand 0 predicate.
(TARGET_SSE_PARTIAL_REG_DEPENDENCY float_truncate df->sf peephole2):
Ditto.
(TARGET_SSE_PARTIAL_REG_DEPENDENCY float {si,di}->{sf,df} peephole2):
Ditto.  Emit the pattern using RTX.

(TARGET_USE_VECTOR_FP_CONVERTS float_extend sf->df splitter):
Use sse_reg_operand as operand 0 predicate.  Do not use true_regnum in
the post-reload splitter.  Use lowpart_subreg instead of gen_rtx_REG.
(TARGET_USE_VECTOR_FP_CONVERTS float_truncate df->sf splitter):
Ditto.
(TARGET_USE_VECTOR_CONVERTS float si->{sf,df} splitter): Use
sse_reg_operand as operand 0 predicate.

(TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_extend sf->df peephole2):
Use sse_reg_operand as operand 0 predicate.  Use lowpart_subreg
instead of gen_rtx_REG.
(TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_truncate df->sf peephole2):
Ditto.

From-SVN: r235906

gcc/ChangeLog
gcc/config/i386/i386.md

index 45e90874387adc60448f10b89db09d17f39b4c14..772dd37c4ff9bd85a04d1968d8a1dc7422adbb2a 100644 (file)
@@ -1,3 +1,29 @@
+2016-05-04  Uros Bizjak  <ubizjak@gmail.com>
+
+       PR target/70873
+       * config/i386/i386.md
+       (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_extend sf->df peephole2):
+       Change to post-epilogue_completed late splitter.  Use sse_reg_operand
+       as operand 0 predicate.
+       (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_truncate df->sf peephole2):
+       Ditto.
+       (TARGET_SSE_PARTIAL_REG_DEPENDENCY float {si,di}->{sf,df} peephole2):
+       Ditto.  Emit the pattern using RTX.
+
+       (TARGET_USE_VECTOR_FP_CONVERTS float_extend sf->df splitter):
+       Use sse_reg_operand as operand 0 predicate.  Do not use true_regnum in
+       the post-reload splitter.  Use lowpart_subreg instead of gen_rtx_REG.
+       (TARGET_USE_VECTOR_FP_CONVERTS float_truncate df->sf splitter):
+       Ditto.
+       (TARGET_USE_VECTOR_CONVERTS float si->{sf,df} splitter): Use
+       sse_reg_operand as operand 0 predicate.
+
+       (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_extend sf->df peephole2):
+       Use sse_reg_operand as operand 0 predicate.  Use lowpart_subreg
+       instead of gen_rtx_REG.
+       (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_truncate df->sf peephole2):
+       Ditto.
+
 2016-05-04  Segher Boessenkool  <segher@kernel.crashing.org>
 
        * function.c (emit_use_return_register_into_block): Delete.
 
        * match.pd: Add BIT_FIELD_REF canonicalizations and vector
        constructor simplifications.
-       * fold-const.c (fold_ternary_loc): Remove duplicate functionality
-       here.
+       * fold-const.c (fold_ternary_loc): Remove duplicate functionality here.
 
 2016-05-04  Oleg Endo  <olegendo@gcc.gnu.org>
 
 2016-05-03  Jakub Jelinek  <jakub@redhat.com>
 
        * config/i386/i386.md (*truncdfsf_mixed, *truncdfsf_i387,
-       *truncxfsf2_mixed, *truncxfdf2_mixed): Use v constraint instead
-       of x.
+       *truncxfsf2_mixed, *truncxfdf2_mixed): Use v constraint instead of x.
 
 2016-05-03  Richard Biener  <rguenther@suse.de>
 
index ba1ff8be5da957e0938d0d045078e4451276653a..dd56b0523e80955f41e6fe901f93d2ac851ba206 100644 (file)
    that might lead to ICE on 32bit target.  The sequence unlikely combine
    anyway.  */
 (define_split
-  [(set (match_operand:DF 0 "register_operand")
+  [(set (match_operand:DF 0 "sse_reg_operand")
         (float_extend:DF
          (match_operand:SF 1 "nonimmediate_operand")))]
   "TARGET_USE_VECTOR_FP_CONVERTS
    && optimize_insn_for_speed_p ()
-   && reload_completed && SSE_REG_P (operands[0])
+   && reload_completed
    && (!EXT_REX_SSE_REG_P (operands[0])
        || TARGET_AVX512VL)"
    [(set (match_dup 2)
     {
       /* If it is unsafe to overwrite upper half of source, we need
         to move to destination and unpack there.  */
-      if (((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
-           || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
-          && true_regnum (operands[0]) != true_regnum (operands[1]))
+      if (REGNO (operands[0]) != REGNO (operands[1])
          || (EXT_REX_SSE_REG_P (operands[1])
              && !TARGET_AVX512VL))
        {
-         rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+         rtx tmp = lowpart_subreg (SFmode, operands[0], DFmode);
          emit_move_insn (tmp, operands[1]);
        }
       else
       /* FIXME: vec_interleave_lowv4sf for AVX512VL should allow
         =v, v, then vbroadcastss will be only needed for AVX512F without
         AVX512VL.  */
-      if (!EXT_REX_SSE_REGNO_P (true_regnum (operands[3])))
+      if (!EXT_REX_SSE_REGNO_P (REGNO (operands[3])))
        emit_insn (gen_vec_interleave_lowv4sf (operands[3], operands[3],
                                               operands[3]));
       else
 
 ;; It's more profitable to split and then extend in the same register.
 (define_peephole2
-  [(set (match_operand:DF 0 "register_operand")
+  [(set (match_operand:DF 0 "sse_reg_operand")
        (float_extend:DF
          (match_operand:SF 1 "memory_operand")))]
   "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
-   && optimize_insn_for_speed_p ()
-   && SSE_REG_P (operands[0])"
+   && optimize_insn_for_speed_p ()"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (float_extend:DF (match_dup 2)))]
-  "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));")
+  "operands[2] = lowpart_subreg (SFmode, operands[0], DFmode);")
 
 (define_insn "*extendsfdf2"
   [(set (match_operand:DF 0 "nonimm_ssenomem_operand" "=f,m,v")
    that might lead to ICE on 32bit target.  The sequence unlikely combine
    anyway.  */
 (define_split
-  [(set (match_operand:SF 0 "register_operand")
+  [(set (match_operand:SF 0 "sse_reg_operand")
         (float_truncate:SF
          (match_operand:DF 1 "nonimmediate_operand")))]
   "TARGET_USE_VECTOR_FP_CONVERTS
    && optimize_insn_for_speed_p ()
-   && reload_completed && SSE_REG_P (operands[0])
+   && reload_completed
    && (!EXT_REX_SSE_REG_P (operands[0])
        || TARGET_AVX512VL)"
    [(set (match_dup 2)
   if (REG_P (operands[1]))
     {
       if (!TARGET_SSE3
-         && true_regnum (operands[0]) != true_regnum (operands[1])
-         && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
-             || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+         && REGNO (operands[0]) != REGNO (operands[1]))
        {
          rtx tmp = lowpart_subreg (DFmode, operands[0], SFmode);
          emit_move_insn (tmp, operands[1]);
 
 ;; It's more profitable to split and then extend in the same register.
 (define_peephole2
-  [(set (match_operand:SF 0 "register_operand")
+  [(set (match_operand:SF 0 "sse_reg_operand")
        (float_truncate:SF
          (match_operand:DF 1 "memory_operand")))]
   "TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
-   && optimize_insn_for_speed_p ()
-   && SSE_REG_P (operands[0])"
+   && optimize_insn_for_speed_p ()"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (float_truncate:SF (match_dup 2)))]
-  "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));")
+  "operands[2] = lowpart_subreg (DFmode, operands[0], SFmode);")
 
 (define_expand "truncdfsf2_with_temp"
   [(parallel [(set (match_operand:SF 0)
   "reload_completed"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (match_dup 2))]
-  "operands[1] = gen_rtx_REG (SFmode, true_regnum (operands[1]));")
+  "operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));")
 
 ;; Conversion from XFmode to {SF,DF}mode
 
 ;; slots when !TARGET_INTER_UNIT_MOVES_TO_VEC disables the general_regs
 ;; alternative in sse2_loadld.
 (define_split
-  [(set (match_operand:MODEF 0 "register_operand")
+  [(set (match_operand:MODEF 0 "sse_reg_operand")
        (float:MODEF (match_operand:SI 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH
-   && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
-   && reload_completed && SSE_REG_P (operands[0])
+  "TARGET_USE_VECTOR_CONVERTS
+   && optimize_function_for_speed_p (cfun)
+   && reload_completed
    && (MEM_P (operands[1]) || TARGET_INTER_UNIT_MOVES_TO_VEC)
    && (!EXT_REX_SSE_REG_P (operands[0])
        || TARGET_AVX512VL)"
   DONE;
 })
 
-;; Avoid partial SSE register dependency stalls
+;; Avoid partial SSE register dependency stalls.  This splitter should split
+;; late in the pass sequence (after register rename pass), so allocated
+;; registers won't change anymore
+
 (define_split
-  [(set (match_operand:MODEF 0 "register_operand")
+  [(set (match_operand:MODEF 0 "sse_reg_operand")
        (float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH
-   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+  "TARGET_SSE_PARTIAL_REG_DEPENDENCY
    && optimize_function_for_speed_p (cfun)
-   && reload_completed && SSE_REG_P (operands[0])
+   && epilogue_completed
    && (!EXT_REX_SSE_REG_P (operands[0])
        || TARGET_AVX512VL)"
-  [(const_int 0)]
+  [(set (match_dup 0)
+       (vec_merge:<MODEF:ssevecmode>
+         (vec_duplicate:<MODEF:ssevecmode>
+           (float:MODEF
+             (match_dup 1)))
+         (match_dup 0)
+         (const_int 1)))]
 {
   const machine_mode vmode = <MODEF:ssevecmode>mode;
-  const machine_mode mode = <MODEF:MODE>mode;
-  rtx t, op0 = lowpart_subreg (vmode, operands[0], mode);
-
-  emit_move_insn (op0, CONST0_RTX (vmode));
 
-  t = gen_rtx_FLOAT (mode, operands[1]);
-  t = gen_rtx_VEC_DUPLICATE (vmode, t);
-  t = gen_rtx_VEC_MERGE (vmode, t, op0, const1_rtx);
-  emit_insn (gen_rtx_SET (op0, t));
-  DONE;
+  operands[0] = lowpart_subreg (vmode, operands[0], <MODEF:MODE>mode);
+  emit_move_insn (operands[0], CONST0_RTX (vmode));
 })
 
-;; Break partial reg stall for cvtsd2ss.
+;; Break partial reg stall for cvtsd2ss.  This splitter should split
+;; late in the pass sequence (after register rename pass),
+;; so allocated registers won't change anymore.
 
-(define_peephole2
-  [(set (match_operand:SF 0 "register_operand")
+(define_split
+  [(set (match_operand:SF 0 "sse_reg_operand")
         (float_truncate:SF
          (match_operand:DF 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH
-   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+  "TARGET_SSE_PARTIAL_REG_DEPENDENCY
    && optimize_function_for_speed_p (cfun)
-   && SSE_REG_P (operands[0])
+   && epilogue_completed
    && (!SSE_REG_P (operands[1])
        || REGNO (operands[0]) != REGNO (operands[1]))
    && (!EXT_REX_SSE_REG_P (operands[0])
   emit_move_insn (operands[0], CONST0_RTX (V4SFmode));
 })
 
-;; Break partial reg stall for cvtss2sd.
+;; Break partial reg stall for cvtss2sd.  This splitter should split
+;; late in the pass sequence (after register rename pass),
+;; so allocated registers won't change anymore.
 
-(define_peephole2
-  [(set (match_operand:DF 0 "register_operand")
+(define_split
+  [(set (match_operand:DF 0 "sse_reg_operand")
         (float_extend:DF
           (match_operand:SF 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_SSE_MATH
-   && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+  "TARGET_SSE_PARTIAL_REG_DEPENDENCY
    && optimize_function_for_speed_p (cfun)
-   && SSE_REG_P (operands[0])
+   && epilogue_completed
    && (!SSE_REG_P (operands[1])
        || REGNO (operands[0]) != REGNO (operands[1]))
    && (!EXT_REX_SSE_REG_P (operands[0])