+2016-05-04 Uros Bizjak <ubizjak@gmail.com>
+
+ PR target/70873
+ * config/i386/i386.md
+ (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_extend sf->df peephole2):
+ Change to post-epilogue_completed late splitter. Use sse_reg_operand
+ as operand 0 predicate.
+ (TARGET_SSE_PARTIAL_REG_DEPENDENCY float_truncate df->sf peephole2):
+ Ditto.
+ (TARGET_SSE_PARTIAL_REG_DEPENDENCY float {si,di}->{sf,df} peephole2):
+ Ditto. Emit the pattern using RTX.
+
+ (TARGET_USE_VECTOR_FP_CONVERTS float_extend sf->df splitter):
+ Use sse_reg_opreand as operand 0 predicate. Do not use true_regnum in
+ the post-reload splitter. Use lowpart_subreg instead of gen_rtx_REG.
+ (TARGET_USE_VECTOR_FP_CONVERTS float_truncate df->sf splitter):
+ Ditto.
+ (TARGET_USE_VECTOR_CONVERTS float si->{sf,df} splitter): Use
+ sse_reg_operand as operand 0 predicate.
+
+ (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_extend sf->df peephole2):
+ Use sse_reg_opreand as operand 0 predicate. Use lowpart_subreg
+ instead of gen_rtx_REG.
+ (TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS float_truncate sf->df peephole2):
+ Ditto.
+
2016-05-04 Segher Boessenkool <segher@kernel.crashing.org>
* function.c (emit_use_return_register_into_block): Delete.
* match.pd: Add BIT_FIELD_REF canonicalizations and vector
constructor simplifications.
- * fold-const.c (fold_ternary_loc): Remove duplicate functionality
- here.
+ * fold-const.c (fold_ternary_loc): Remove duplicate functionality here.
2016-05-04 Oleg Endo <olegendo@gcc.gnu.org>
2016-05-03 Jakub Jelinek <jakub@redhat.com>
* config/i386/i386.md (*truncdfsf_mixed, *truncdfsf_i387,
- *truncxfsf2_mixed, *truncxfdf2_mixed): Use v constraint instead
- of x.
+ *truncxfsf2_mixed, *truncxfdf2_mixed): Use v constraint instead of x.
2016-05-03 Richard Biener <rguenther@suse.de>
that might lead to ICE on 32bit target. The sequence unlikely combine
anyway. */
(define_split
- [(set (match_operand:DF 0 "register_operand")
+ [(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "nonimmediate_operand")))]
"TARGET_USE_VECTOR_FP_CONVERTS
&& optimize_insn_for_speed_p ()
- && reload_completed && SSE_REG_P (operands[0])
+ && reload_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
[(set (match_dup 2)
{
/* If it is unsafe to overwrite upper half of source, we need
to move to destination and unpack there. */
- if (((ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
- || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 4)
- && true_regnum (operands[0]) != true_regnum (operands[1]))
+ if (REGNO (operands[0]) != REGNO (operands[1])
|| (EXT_REX_SSE_REG_P (operands[1])
&& !TARGET_AVX512VL))
{
- rtx tmp = gen_rtx_REG (SFmode, true_regnum (operands[0]));
+ rtx tmp = lowpart_subreg (SFmode, operands[0], DFmode);
emit_move_insn (tmp, operands[1]);
}
else
/* FIXME: vec_interleave_lowv4sf for AVX512VL should allow
=v, v, then vbroadcastss will be only needed for AVX512F without
AVX512VL. */
- if (!EXT_REX_SSE_REGNO_P (true_regnum (operands[3])))
+ if (!EXT_REX_SSE_REGNO_P (REGNO (operands[3])))
emit_insn (gen_vec_interleave_lowv4sf (operands[3], operands[3],
operands[3]));
else
;; It's more profitable to split and then extend in the same register.
(define_peephole2
- [(set (match_operand:DF 0 "register_operand")
+ [(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "memory_operand")))]
"TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
- && optimize_insn_for_speed_p ()
- && SSE_REG_P (operands[0])"
+ && optimize_insn_for_speed_p ()"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (float_extend:DF (match_dup 2)))]
- "operands[2] = gen_rtx_REG (SFmode, REGNO (operands[0]));")
+ "operands[2] = lowpart_subreg (SFmode, operands[0], DFmode);")
(define_insn "*extendsfdf2"
[(set (match_operand:DF 0 "nonimm_ssenomem_operand" "=f,m,v")
that might lead to ICE on 32bit target. The sequence unlikely combine
anyway. */
(define_split
- [(set (match_operand:SF 0 "register_operand")
+ [(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "nonimmediate_operand")))]
"TARGET_USE_VECTOR_FP_CONVERTS
&& optimize_insn_for_speed_p ()
- && reload_completed && SSE_REG_P (operands[0])
+ && reload_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
[(set (match_dup 2)
if (REG_P (operands[1]))
{
if (!TARGET_SSE3
- && true_regnum (operands[0]) != true_regnum (operands[1])
- && (ORIGINAL_REGNO (operands[1]) < FIRST_PSEUDO_REGISTER
- || PSEUDO_REGNO_BYTES (ORIGINAL_REGNO (operands[1])) > 8))
+ && REGNO (operands[0]) != REGNO (operands[1]))
{
rtx tmp = lowpart_subreg (DFmode, operands[0], SFmode);
emit_move_insn (tmp, operands[1]);
;; It's more profitable to split and then extend in the same register.
(define_peephole2
- [(set (match_operand:SF 0 "register_operand")
+ [(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "memory_operand")))]
"TARGET_SPLIT_MEM_OPND_FOR_FP_CONVERTS
- && optimize_insn_for_speed_p ()
- && SSE_REG_P (operands[0])"
+ && optimize_insn_for_speed_p ()"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (float_truncate:SF (match_dup 2)))]
- "operands[2] = gen_rtx_REG (DFmode, REGNO (operands[0]));")
+ "operands[2] = lowpart_subreg (DFmode, operands[0], SFmode);")
(define_expand "truncdfsf2_with_temp"
[(parallel [(set (match_operand:SF 0)
"reload_completed"
[(set (match_dup 2) (match_dup 1))
(set (match_dup 0) (match_dup 2))]
- "operands[1] = gen_rtx_REG (SFmode, true_regnum (operands[1]));")
+ "operands[1] = gen_rtx_REG (SFmode, REGNO (operands[1]));")
;; Conversion from XFmode to {SF,DF}mode
;; slots when !TARGET_INTER_UNIT_MOVES_TO_VEC disables the general_regs
;; alternative in sse2_loadld.
(define_split
- [(set (match_operand:MODEF 0 "register_operand")
+ [(set (match_operand:MODEF 0 "sse_reg_operand")
(float:MODEF (match_operand:SI 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_USE_VECTOR_CONVERTS && optimize_function_for_speed_p (cfun)
- && reload_completed && SSE_REG_P (operands[0])
+ "TARGET_USE_VECTOR_CONVERTS
+ && optimize_function_for_speed_p (cfun)
+ && reload_completed
&& (MEM_P (operands[1]) || TARGET_INTER_UNIT_MOVES_TO_VEC)
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
DONE;
})
-;; Avoid partial SSE register dependency stalls
+;; Avoid partial SSE register dependency stalls. This splitter should split
+;; late in the pass sequence (after register rename pass), so allocated
+;; registers won't change anymore
+
(define_split
- [(set (match_operand:MODEF 0 "register_operand")
+ [(set (match_operand:MODEF 0 "sse_reg_operand")
(float:MODEF (match_operand:SWI48 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ "TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
- && reload_completed && SSE_REG_P (operands[0])
+ && epilogue_completed
&& (!EXT_REX_SSE_REG_P (operands[0])
|| TARGET_AVX512VL)"
- [(const_int 0)]
+ [(set (match_dup 0)
+ (vec_merge:<MODEF:ssevecmode>
+ (vec_duplicate:<MODEF:ssevecmode>
+ (float:MODEF
+ (match_dup 1)))
+ (match_dup 0)
+ (const_int 1)))]
{
const machine_mode vmode = <MODEF:ssevecmode>mode;
- const machine_mode mode = <MODEF:MODE>mode;
- rtx t, op0 = lowpart_subreg (vmode, operands[0], mode);
-
- emit_move_insn (op0, CONST0_RTX (vmode));
- t = gen_rtx_FLOAT (mode, operands[1]);
- t = gen_rtx_VEC_DUPLICATE (vmode, t);
- t = gen_rtx_VEC_MERGE (vmode, t, op0, const1_rtx);
- emit_insn (gen_rtx_SET (op0, t));
- DONE;
+ operands[0] = lowpart_subreg (vmode, operands[0], <MODEF:MODE>mode);
+ emit_move_insn (operands[0], CONST0_RTX (vmode));
})
-;; Break partial reg stall for cvtsd2ss.
+;; Break partial reg stall for cvtsd2ss. This splitter should split
+;; late in the pass sequence (after register rename pass),
+;; so allocated registers won't change anymore.
-(define_peephole2
- [(set (match_operand:SF 0 "register_operand")
+(define_split
+ [(set (match_operand:SF 0 "sse_reg_operand")
(float_truncate:SF
(match_operand:DF 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ "TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
- && SSE_REG_P (operands[0])
+ && epilogue_completed
&& (!SSE_REG_P (operands[1])
|| REGNO (operands[0]) != REGNO (operands[1]))
&& (!EXT_REX_SSE_REG_P (operands[0])
emit_move_insn (operands[0], CONST0_RTX (V4SFmode));
})
-;; Break partial reg stall for cvtss2sd.
+;; Break partial reg stall for cvtss2sd. This splitter should split
+;; late in the pass sequence (after register rename pass),
+;; so allocated registers won't change anymore.
-(define_peephole2
- [(set (match_operand:DF 0 "register_operand")
+(define_split
+ [(set (match_operand:DF 0 "sse_reg_operand")
(float_extend:DF
(match_operand:SF 1 "nonimmediate_operand")))]
- "TARGET_SSE2 && TARGET_SSE_MATH
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ "TARGET_SSE_PARTIAL_REG_DEPENDENCY
&& optimize_function_for_speed_p (cfun)
- && SSE_REG_P (operands[0])
+ && epilogue_completed
&& (!SSE_REG_P (operands[1])
|| REGNO (operands[0]) != REGNO (operands[1]))
&& (!EXT_REX_SSE_REG_P (operands[0])