From: Michael Meissner Date: Thu, 5 Jan 2017 00:43:53 +0000 (+0000) Subject: re PR target/71977 (powerpc64: Use VSR when operating on float and integer) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=fba4b86109078adda686acc0102a923de709146d;p=gcc.git re PR target/71977 (powerpc64: Use VSR when operating on float and integer) [gcc] 2017-01-04 Michael Meissner PR target/71977 PR target/70568 PR target/78823 * config/rs6000/predicates.md (sf_subreg_operand): New predicate. (altivec_register_operand): Do not return true if the operand contains a SUBREG mixing SImode and SFmode. (vsx_register_operand): Likewise. (vsx_reg_sfsubreg_ok): New predicate. (vfloat_operand): Do not return true if the operand contains a SUBREG mixing SImode and SFmode. (vint_operand): Likewise. (vlogical_operand): Likewise. (gpc_reg_operand): Likewise. (int_reg_operand): Likewise. * config/rs6000/rs6000-protos.h (valid_sf_si_move): Add declaration. * config/rs6000/rs6000.c (valid_sf_si_move): New function to determine if a MOVSI or MOVSF operation contains SUBREGs that mix SImode and SFmode. (rs6000_emit_move_si_sf_subreg): New helper function. (rs6000_emit_move): Call rs6000_emit_move_si_sf_subreg to possbily fixup SUBREGs involving SImode and SFmode. * config/rs6000/vsx.md (SFBOOL_*): New constants that are operand numbers for the new peephole2 optimization. (peephole2 for SFmode unions): New peephole2 to optimize cases in the GLIBC math library that do AND/IOR/XOR operations on single precision floating point. * config/rs6000/rs6000.h (TARGET_NO_SF_SUBREG): New internal target macros to say whether we need to avoid SUBREGs mixing SImode and SFmode. (TARGET_ALLOW_SF_SUBREG): Likewise. * config/rs6000/rs6000.md (UNSPEC_SF_FROM_SI): New unspecs. (UNSPEC_SI_FROM_SF): Likewise. (iorxor): Change spacing. (and_ior_xor): New iterator for AND, IOR, and XOR. (movsi_from_sf): New insns for SImode/SFmode SUBREG support. (movdi_from_sf_zero_ext): Likewise. 
(mov_hardfloat, FMOVE32 iterator): Use register_operand instead of gpc_reg_operand. Add SImode/SFmode SUBREG support. (movsf_from_si): New insn for SImode/SFmode SUBREG support. (fma4): Use gpc_reg_operand instead of register_operand. (fms4): Likewise. (fnma4): Likewise. (fnms4): Likewise. (nfma4): Likewise. (nfms4): Likewise. [gcc/testsuite] 2017-01-04 Michael Meissner PR target/71977 PR target/70568 PR target/78823 * gcc.target/powerpc/pr71977-1.c: New tests to check whether on 64-bit VSX systems with direct move, whether we optimize common code sequences in the GLIBC math library for float math functions. * gcc.target/powerpc/pr71977-2.c: Likewise. From-SVN: r244084 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 45efbb47c86..e0a57f12768 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,52 @@ +2017-01-04 Michael Meissner + + PR target/71977 + PR target/70568 + PR target/78823 + * config/rs6000/predicates.md (sf_subreg_operand): New predicate. + (altivec_register_operand): Do not return true if the operand + contains a SUBREG mixing SImode and SFmode. + (vsx_register_operand): Likewise. + (vsx_reg_sfsubreg_ok): New predicate. + (vfloat_operand): Do not return true if the operand contains a + SUBREG mixing SImode and SFmode. + (vint_operand): Likewise. + (vlogical_operand): Likewise. + (gpc_reg_operand): Likewise. + (int_reg_operand): Likewise. + * config/rs6000/rs6000-protos.h (valid_sf_si_move): Add + declaration. + * config/rs6000/rs6000.c (valid_sf_si_move): New function to + determine if a MOVSI or MOVSF operation contains SUBREGs that mix + SImode and SFmode. + (rs6000_emit_move_si_sf_subreg): New helper function. + (rs6000_emit_move): Call rs6000_emit_move_si_sf_subreg to possbily + fixup SUBREGs involving SImode and SFmode. + * config/rs6000/vsx.md (SFBOOL_*): New constants that are operand + numbers for the new peephole2 optimization. 
+ (peephole2 for SFmode unions): New peephole2 to optimize cases in + the GLIBC math library that do AND/IOR/XOR operations on single + precision floating point. + * config/rs6000/rs6000.h (TARGET_NO_SF_SUBREG): New internal + target macros to say whether we need to avoid SUBREGs mixing + SImode and SFmode. + (TARGET_ALLOW_SF_SUBREG): Likewise. + * config/rs6000/rs6000.md (UNSPEC_SF_FROM_SI): New unspecs. + (UNSPEC_SI_FROM_SF): Likewise. + (iorxor): Change spacing. + (and_ior_xor): New iterator for AND, IOR, and XOR. + (movsi_from_sf): New insns for SImode/SFmode SUBREG support. + (movdi_from_sf_zero_ext): Likewise. + (mov_hardfloat, FMOVE32 iterator): Use register_operand + instead of gpc_reg_operand. Add SImode/SFmode SUBREG support. + (movsf_from_si): New insn for SImode/SFmode SUBREG support. + (fma4): Use gpc_reg_operand instead of register_operand. + (fms4): Likewise. + (fnma4): Likewise. + (fnms4): Likewise. + (nfma4): Likewise. + (nfms4): Likewise. + 2017-01-04 Marek Polacek PR c++/64767 diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md index f79982f9029..3d69fc86dee 100644 --- a/gcc/config/rs6000/predicates.md +++ b/gcc/config/rs6000/predicates.md @@ -31,12 +31,47 @@ (match_test "REGNO (op) == CTR_REGNO || REGNO (op) > LAST_VIRTUAL_REGISTER"))) +;; Return 1 if op is a SUBREG that is used to look at a SFmode value as +;; and integer or vice versa. +;; +;; In the normal case where SFmode is in a floating point/vector register, it +;; is stored as a DFmode and has a different format. If we don't transform the +;; value, things that use logical operations on the values will get the wrong +;; value. +;; +;; If we don't have 64-bit and direct move, this conversion will be done by +;; store and load, instead of by fiddling with the bits within the register. 
+(define_predicate "sf_subreg_operand" + (match_code "subreg") +{ + rtx inner_reg = SUBREG_REG (op); + machine_mode inner_mode = GET_MODE (inner_reg); + + if (TARGET_ALLOW_SF_SUBREG || !REG_P (inner_reg)) + return 0; + + if ((mode == SFmode && GET_MODE_CLASS (inner_mode) == MODE_INT) + || (GET_MODE_CLASS (mode) == MODE_INT && inner_mode == SFmode)) + { + if (INT_REGNO_P (REGNO (inner_reg))) + return 0; + + return 1; + } + return 0; +}) + ;; Return 1 if op is an Altivec register. (define_predicate "altivec_register_operand" (match_operand 0 "register_operand") { if (GET_CODE (op) == SUBREG) - op = SUBREG_REG (op); + { + if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode)) + return 0; + + op = SUBREG_REG (op); + } if (!REG_P (op)) return 0; @@ -50,6 +85,27 @@ ;; Return 1 if op is a VSX register. (define_predicate "vsx_register_operand" (match_operand 0 "register_operand") +{ + if (GET_CODE (op) == SUBREG) + { + if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode)) + return 0; + + op = SUBREG_REG (op); + } + + if (!REG_P (op)) + return 0; + + if (REGNO (op) >= FIRST_PSEUDO_REGISTER) + return 1; + + return VSX_REGNO_P (REGNO (op)); +}) + +;; Like vsx_register_operand, but allow SF SUBREGS +(define_predicate "vsx_reg_sfsubreg_ok" + (match_operand 0 "register_operand") { if (GET_CODE (op) == SUBREG) op = SUBREG_REG (op); @@ -69,7 +125,12 @@ (match_operand 0 "register_operand") { if (GET_CODE (op) == SUBREG) - op = SUBREG_REG (op); + { + if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode)) + return 0; + + op = SUBREG_REG (op); + } if (!REG_P (op)) return 0; @@ -86,7 +147,12 @@ (match_operand 0 "register_operand") { if (GET_CODE (op) == SUBREG) - op = SUBREG_REG (op); + { + if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode)) + return 0; + + op = SUBREG_REG (op); + } if (!REG_P (op)) return 0; @@ -103,7 +169,13 @@ (match_operand 0 "register_operand") { if (GET_CODE (op) == SUBREG) - op = SUBREG_REG (op); + { + if (TARGET_NO_SF_SUBREG && sf_subreg_operand 
(op, mode)) + return 0; + + op = SUBREG_REG (op); + } + if (!REG_P (op)) return 0; @@ -221,6 +293,9 @@ (match_test "IN_RANGE (INTVAL (op), 0, 15)"))) ;; Return 1 if op is a register that is not special. +;; Disallow (SUBREG:SF (REG:SI)) and (SUBREG:SI (REG:SF)) on VSX systems where +;; you need to be careful in moving a SFmode to SImode and vice versa due to +;; the fact that SFmode is represented as DFmode in the VSX registers. (define_predicate "gpc_reg_operand" (match_operand 0 "register_operand") { @@ -228,7 +303,12 @@ return 0; if (GET_CODE (op) == SUBREG) - op = SUBREG_REG (op); + { + if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode)) + return 0; + + op = SUBREG_REG (op); + } if (!REG_P (op)) return 0; @@ -246,7 +326,8 @@ }) ;; Return 1 if op is a general purpose register. Unlike gpc_reg_operand, don't -;; allow floating point or vector registers. +;; allow floating point or vector registers. Since vector registers are not +;; allowed, we don't have to reject SFmode/SImode subregs. (define_predicate "int_reg_operand" (match_operand 0 "register_operand") { @@ -254,7 +335,12 @@ return 0; if (GET_CODE (op) == SUBREG) - op = SUBREG_REG (op); + { + if (TARGET_NO_SF_SUBREG && sf_subreg_operand (op, mode)) + return 0; + + op = SUBREG_REG (op); + } if (!REG_P (op)) return 0; @@ -266,6 +352,8 @@ }) ;; Like int_reg_operand, but don't return true for pseudo registers +;; We don't have to check for SF SUBREGS because pseudo registers +;; are not allowed, and SF SUBREGs are ok within GPR registers. 
(define_predicate "int_reg_operand_not_pseudo" (match_operand 0 "register_operand") { diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 92e75a05376..c20d3b5271e 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -153,6 +153,7 @@ extern void rs6000_fatal_bad_address (rtx); extern rtx create_TOC_reference (rtx, rtx); extern void rs6000_split_multireg_move (rtx, rtx); extern void rs6000_emit_le_vsx_move (rtx, rtx, machine_mode); +extern bool valid_sf_si_move (rtx, rtx, machine_mode); extern void rs6000_emit_move (rtx, rtx, machine_mode); extern rtx rs6000_secondary_memory_needed_rtx (machine_mode); extern machine_mode rs6000_secondary_memory_needed_mode (machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 875015ce10e..fe858738d38 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -10402,6 +10402,78 @@ rs6000_emit_le_vsx_move (rtx dest, rtx source, machine_mode mode) } } +/* Return whether a SFmode or SImode move can be done without converting one + mode to another. This arises when we have: + + (SUBREG:SF (REG:SI ...)) + (SUBREG:SI (REG:SF ...)) + + and one of the values is in a floating point/vector register, where SFmode + scalars are stored in DFmode format. */ + +bool +valid_sf_si_move (rtx dest, rtx src, machine_mode mode) +{ + if (TARGET_ALLOW_SF_SUBREG) + return true; + + if (mode != SFmode && GET_MODE_CLASS (mode) != MODE_INT) + return true; + + if (!SUBREG_P (src) || !sf_subreg_operand (src, mode)) + return true; + + /* Allow (set (SUBREG:SI (REG:SF)) (SUBREG:SI (REG:SF))). */ + if (SUBREG_P (dest)) + { + rtx dest_subreg = SUBREG_REG (dest); + rtx src_subreg = SUBREG_REG (src); + return GET_MODE (dest_subreg) == GET_MODE (src_subreg); + } + + return false; +} + + +/* Helper function to change moves with: + + (SUBREG:SF (REG:SI)) and + (SUBREG:SI (REG:SF)) + + into separate UNSPEC insns. 
In the PowerPC architecture, scalar SFmode + values are stored as DFmode values in the VSX registers. We need to convert + the bits before we can use a direct move or operate on the bits in the + vector register as an integer type. + + Skip things like (set (SUBREG:SI (...) (SUBREG:SI (...)). */ + +static bool +rs6000_emit_move_si_sf_subreg (rtx dest, rtx source, machine_mode mode) +{ + if (TARGET_DIRECT_MOVE_64BIT && !reload_in_progress && !reload_completed + && !lra_in_progress + && (!SUBREG_P (dest) || !sf_subreg_operand (dest, mode)) + && SUBREG_P (source) && sf_subreg_operand (source, mode)) + { + rtx inner_source = SUBREG_REG (source); + machine_mode inner_mode = GET_MODE (inner_source); + + if (mode == SImode && inner_mode == SFmode) + { + emit_insn (gen_movsi_from_sf (dest, inner_source)); + return true; + } + + if (mode == SFmode && inner_mode == SImode) + { + emit_insn (gen_movsf_from_si (dest, inner_source)); + return true; + } + } + + return false; +} + /* Emit a move from SOURCE to DEST in mode MODE. */ void rs6000_emit_move (rtx dest, rtx source, machine_mode mode) @@ -10432,6 +10504,11 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode mode) gcc_unreachable (); } + /* See if we need to special case SImode/SFmode SUBREG moves. */ + if ((mode == SImode || mode == SFmode) && SUBREG_P (source) + && rs6000_emit_move_si_sf_subreg (dest, source, mode)) + return; + /* Check if GCC is setting up a block move that will end up using FP registers as temporaries. We must make sure this is acceptable. 
*/ if (GET_CODE (operands[0]) == MEM diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h index 4003730d9bd..0180e0c3dfd 100644 --- a/gcc/config/rs6000/rs6000.h +++ b/gcc/config/rs6000/rs6000.h @@ -608,6 +608,12 @@ extern int rs6000_vector_align[]; && TARGET_POWERPC64) #define TARGET_VEXTRACTUB (TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \ && TARGET_UPPER_REGS_DI && TARGET_POWERPC64) + + +/* Whether we should avoid (SUBREG:SI (REG:SF) and (SUBREG:SF (REG:SI). */ +#define TARGET_NO_SF_SUBREG TARGET_DIRECT_MOVE_64BIT +#define TARGET_ALLOW_SF_SUBREG (!TARGET_DIRECT_MOVE_64BIT) + /* This wants to be set for p8 and newer. On p7, overlapping unaligned loads are slow. */ #define TARGET_EFFICIENT_OVERLAPPING_UNALIGNED TARGET_EFFICIENT_UNALIGNED_VSX diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 7e103b019f0..f7c1ab26a99 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -150,6 +150,8 @@ UNSPEC_IEEE128_CONVERT UNSPEC_SIGNBIT UNSPEC_DOLOOP + UNSPEC_SF_FROM_SI + UNSPEC_SI_FROM_SF ]) ;; @@ -561,7 +563,8 @@ (define_code_attr return_str [(return "") (simple_return "simple_")]) ; Logical operators. -(define_code_iterator iorxor [ior xor]) +(define_code_iterator iorxor [ior xor]) +(define_code_iterator and_ior_xor [and ior xor]) ; Signed/unsigned variants of ops. (define_code_iterator any_extend [sign_extend zero_extend]) @@ -6768,6 +6771,157 @@ [(set_attr "type" "*,*,load,store,*,*,*,mfjmpr,mtjmpr,*,*,fpstore,fpload") (set_attr "length" "4,4,4,4,4,4,8,4,4,4,4,4,4")]) +;; Like movsi, but adjust a SF value to be used in a SI context, i.e. +;; (set (reg:SI ...) (subreg:SI (reg:SF ...) 0)) +;; +;; Because SF values are actually stored as DF values within the vector +;; registers, we need to convert the value to the vector SF format when +;; we need to use the bits in a union or similar cases. We only need +;; to do this transformation when the value is a vector register. 
Loads, +;; stores, and transfers within GPRs are assumed to be safe. +;; +;; This is a more general case of reload_gpr_from_vsxsf. That insn must have +;; no alternatives, because the call is created as part of secondary_reload, +;; and operand #2's register class is used to allocate the temporary register. +;; This function is called before reload, and it creates the temporary as +;; needed. + +;; MR LWZ LFIWZX LXSIWZX STW +;; STFS STXSSP STXSSPX VSX->GPR MTVSRWZ +;; VSX->VSX + +(define_insn_and_split "movsi_from_sf" + [(set (match_operand:SI 0 "rs6000_nonimmediate_operand" + "=r, r, ?*wI, ?*wH, m, + m, wY, Z, r, wIwH, + ?wK") + + (unspec:SI [(match_operand:SF 1 "input_operand" + "r, m, Z, Z, r, + f, wu, wu, wIwH, r, + wK")] + UNSPEC_SI_FROM_SF)) + + (clobber (match_scratch:V4SF 2 + "=X, X, X, X, X, + X, X, X, wa, X, + wa"))] + + "TARGET_NO_SF_SUBREG + && (register_operand (operands[0], SImode) + || register_operand (operands[1], SFmode))" + "@ + mr %0,%1 + lwz%U1%X1 %0,%1 + lfiwzx %0,%y1 + lxsiwzx %x0,%y1 + stw%U0%X0 %1,%0 + stfs%U0%X0 %1,%0 + stxssp %1,%0 + stxsspx %x1,%y0 + # + mtvsrwz %x0,%1 + #" + "&& reload_completed + && register_operand (operands[0], SImode) + && vsx_reg_sfsubreg_ok (operands[1], SFmode)" + [(const_int 0)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2 = operands[2]; + rtx op0_di = gen_rtx_REG (DImode, REGNO (op0)); + + emit_insn (gen_vsx_xscvdpspn_scalar (op2, op1)); + + if (int_reg_operand (op0, SImode)) + { + emit_insn (gen_p8_mfvsrd_4_disf (op0_di, op2)); + emit_insn (gen_lshrdi3 (op0_di, op0_di, GEN_INT (32))); + } + else + { + rtx op1_v16qi = gen_rtx_REG (V16QImode, REGNO (op1)); + rtx byte_off = VECTOR_ELT_ORDER_BIG ? 
const0_rtx : GEN_INT (12); + emit_insn (gen_vextract4b (op0_di, op1_v16qi, byte_off)); + } + + DONE; +} + [(set_attr "type" + "*, load, fpload, fpload, store, + fpstore, fpstore, fpstore, mftgpr, mffgpr, + veclogical") + + (set_attr "length" + "4, 4, 4, 4, 4, + 4, 4, 4, 12, 4, + 8")]) + +;; movsi_from_sf with zero extension +;; +;; RLDICL LWZ LFIWZX LXSIWZX VSX->GPR +;; MTVSRWZ VSX->VSX + +(define_insn_and_split "*movdi_from_sf_zero_ext" + [(set (match_operand:DI 0 "gpc_reg_operand" + "=r, r, ?*wI, ?*wH, r, + wIwH, ?wK") + + (zero_extend:DI + (unspec:SI [(match_operand:SF 1 "input_operand" + "r, m, Z, Z, wIwH, + r, wK")] + UNSPEC_SI_FROM_SF))) + + (clobber (match_scratch:V4SF 2 + "=X, X, X, X, wa, + X, wa"))] + + "TARGET_DIRECT_MOVE_64BIT + && (register_operand (operands[0], DImode) + || register_operand (operands[1], SImode))" + "@ + rldicl %0,%1,0,32 + lwz%U1%X1 %0,%1 + lfiwzx %0,%y1 + lxsiwzx %x0,%y1 + # + mtvsrwz %x0,%1 + #" + "&& reload_completed + && vsx_reg_sfsubreg_ok (operands[1], SFmode)" + [(const_int 0)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2 = operands[2]; + + emit_insn (gen_vsx_xscvdpspn_scalar (op2, op1)); + + if (int_reg_operand (op0, DImode)) + { + emit_insn (gen_p8_mfvsrd_4_disf (op0, op2)); + emit_insn (gen_lshrdi3 (op0, op0, GEN_INT (32))); + } + else + { + rtx op0_si = gen_rtx_REG (SImode, REGNO (op0)); + rtx op1_v16qi = gen_rtx_REG (V16QImode, REGNO (op1)); + rtx byte_off = VECTOR_ELT_ORDER_BIG ? const0_rtx : GEN_INT (12); + emit_insn (gen_vextract4b (op0_si, op1_v16qi, byte_off)); + } + + DONE; +} + [(set_attr "type" + "*, load, fpload, fpload, mftgpr, + mffgpr, veclogical") + + (set_attr "length" + "4, 4, 4, 4, 12, + 4, 8")]) + ;; Split a load of a large constant into the appropriate two-insn ;; sequence. 
@@ -6977,9 +7131,11 @@ "m, , , Z, r, , , , , , r, , f, , r, r, *h, 0"))] - "(gpc_reg_operand (operands[0], mode) - || gpc_reg_operand (operands[1], mode)) - && (TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT)" + "(register_operand (operands[0], mode) + || register_operand (operands[1], mode)) + && TARGET_HARD_FLOAT && TARGET_FPRS && TARGET_SINGLE_FLOAT + && (TARGET_ALLOW_SF_SUBREG + || valid_sf_si_move (operands[0], operands[1], mode))" "@ lwz%U1%X1 %0,%1 @@ -7021,6 +7177,75 @@ [(set_attr "type" "*,mtjmpr,mfjmpr,load,store,*,*,*,*,*") (set_attr "length" "4,4,4,4,4,4,4,4,8,4")]) +;; Like movsf, but adjust a SI value to be used in a SF context, i.e. +;; (set (reg:SF ...) (subreg:SF (reg:SI ...) 0)) +;; +;; Because SF values are actually stored as DF values within the vector +;; registers, we need to convert the value to the vector SF format when +;; we need to use the bits in a union or similar cases. We only need +;; to do this transformation when the value is a vector register. Loads, +;; stores, and transfers within GPRs are assumed to be safe. +;; +;; This is a more general case of reload_vsx_from_gprsf. That insn must have +;; no alternatives, because the call is created as part of secondary_reload, +;; and operand #2's register class is used to allocate the temporary register. +;; This function is called before reload, and it creates the temporary as +;; needed. 
+ +;; LWZ LFS LXSSP LXSSPX STW STFIWX +;; STXSIWX GPR->VSX VSX->GPR GPR->GPR +(define_insn_and_split "movsf_from_si" + [(set (match_operand:SF 0 "rs6000_nonimmediate_operand" + "=!r, f, wb, wu, m, Z, + Z, wy, ?r, !r") + + (unspec:SF [(match_operand:SI 1 "input_operand" + "m, m, wY, Z, r, f, + wu, r, wy, r")] + UNSPEC_SF_FROM_SI)) + + (clobber (match_scratch:DI 2 + "=X, X, X, X, X, X, + X, r, X, X"))] + + "TARGET_NO_SF_SUBREG + && (register_operand (operands[0], SFmode) + || register_operand (operands[1], SImode))" + "@ + lwz%U1%X1 %0,%1 + lfs%U1%X1 %0,%1 + lxssp %0,%1 + lxsspx %x0,%y1 + stw%U0%X0 %1,%0 + stfiwx %1,%y0 + stxsiwx %x1,%y0 + # + mfvsrwz %0,%x1 + mr %0,%1" + + "&& reload_completed + && vsx_reg_sfsubreg_ok (operands[0], SFmode) + && int_reg_operand_not_pseudo (operands[1], SImode)" + [(const_int 0)] +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2 = operands[2]; + rtx op1_di = gen_rtx_REG (DImode, REGNO (op1)); + + /* Move SF value to upper 32-bits for xscvspdpn. */ + emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32))); + emit_insn (gen_p8_mtvsrd_sf (op0, op2)); + emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0)); + DONE; +} + [(set_attr "length" + "4, 4, 4, 4, 4, 4, + 4, 12, 4, 4") + (set_attr "type" + "load, fpload, fpload, fpload, store, fpstore, + fpstore, vecfloat, mffgpr, *")]) + ;; Move 64-bit binary/decimal floating point (define_expand "mov" @@ -13231,11 +13456,11 @@ ;; Note that the conditions for expansion are in the FMA_F iterator. (define_expand "fma4" - [(set (match_operand:FMA_F 0 "register_operand" "") + [(set (match_operand:FMA_F 0 "gpc_reg_operand" "") (fma:FMA_F - (match_operand:FMA_F 1 "register_operand" "") - (match_operand:FMA_F 2 "register_operand" "") - (match_operand:FMA_F 3 "register_operand" "")))] + (match_operand:FMA_F 1 "gpc_reg_operand" "") + (match_operand:FMA_F 2 "gpc_reg_operand" "") + (match_operand:FMA_F 3 "gpc_reg_operand" "")))] "" "") @@ -13255,11 +13480,11 @@ ; Altivec only has fma and nfms. 
(define_expand "fms4" - [(set (match_operand:FMA_F 0 "register_operand" "") + [(set (match_operand:FMA_F 0 "gpc_reg_operand" "") (fma:FMA_F - (match_operand:FMA_F 1 "register_operand" "") - (match_operand:FMA_F 2 "register_operand" "") - (neg:FMA_F (match_operand:FMA_F 3 "register_operand" ""))))] + (match_operand:FMA_F 1 "gpc_reg_operand" "") + (match_operand:FMA_F 2 "gpc_reg_operand" "") + (neg:FMA_F (match_operand:FMA_F 3 "gpc_reg_operand" ""))))] "!VECTOR_UNIT_ALTIVEC_P (mode)" "") @@ -13279,34 +13504,34 @@ ;; If signed zeros are ignored, -(a * b - c) = -a * b + c. (define_expand "fnma4" - [(set (match_operand:FMA_F 0 "register_operand" "") + [(set (match_operand:FMA_F 0 "gpc_reg_operand" "") (neg:FMA_F (fma:FMA_F - (match_operand:FMA_F 1 "register_operand" "") - (match_operand:FMA_F 2 "register_operand" "") - (neg:FMA_F (match_operand:FMA_F 3 "register_operand" "")))))] + (match_operand:FMA_F 1 "gpc_reg_operand" "") + (match_operand:FMA_F 2 "gpc_reg_operand" "") + (neg:FMA_F (match_operand:FMA_F 3 "gpc_reg_operand" "")))))] "!HONOR_SIGNED_ZEROS (mode)" "") ;; If signed zeros are ignored, -(a * b + c) = -a * b - c. (define_expand "fnms4" - [(set (match_operand:FMA_F 0 "register_operand" "") + [(set (match_operand:FMA_F 0 "gpc_reg_operand" "") (neg:FMA_F (fma:FMA_F - (match_operand:FMA_F 1 "register_operand" "") - (match_operand:FMA_F 2 "register_operand" "") - (match_operand:FMA_F 3 "register_operand" ""))))] + (match_operand:FMA_F 1 "gpc_reg_operand" "") + (match_operand:FMA_F 2 "gpc_reg_operand" "") + (match_operand:FMA_F 3 "gpc_reg_operand" ""))))] "!HONOR_SIGNED_ZEROS (mode) && !VECTOR_UNIT_ALTIVEC_P (mode)" "") ; Not an official optab name, but used from builtins. 
(define_expand "nfma4" - [(set (match_operand:FMA_F 0 "register_operand" "") + [(set (match_operand:FMA_F 0 "gpc_reg_operand" "") (neg:FMA_F (fma:FMA_F - (match_operand:FMA_F 1 "register_operand" "") - (match_operand:FMA_F 2 "register_operand" "") - (match_operand:FMA_F 3 "register_operand" ""))))] + (match_operand:FMA_F 1 "gpc_reg_operand" "") + (match_operand:FMA_F 2 "gpc_reg_operand" "") + (match_operand:FMA_F 3 "gpc_reg_operand" ""))))] "!VECTOR_UNIT_ALTIVEC_P (mode)" "") @@ -13327,12 +13552,12 @@ ; Not an official optab name, but used from builtins. (define_expand "nfms4" - [(set (match_operand:FMA_F 0 "register_operand" "") + [(set (match_operand:FMA_F 0 "gpc_reg_operand" "") (neg:FMA_F (fma:FMA_F - (match_operand:FMA_F 1 "register_operand" "") - (match_operand:FMA_F 2 "register_operand" "") - (neg:FMA_F (match_operand:FMA_F 3 "register_operand" "")))))] + (match_operand:FMA_F 1 "gpc_reg_operand" "") + (match_operand:FMA_F 2 "gpc_reg_operand" "") + (neg:FMA_F (match_operand:FMA_F 3 "gpc_reg_operand" "")))))] "" "") diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index 6264e6c7206..e054f5c7caa 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -3897,3 +3897,149 @@ "TARGET_P9_VECTOR" "xxinsertw %x0,%x1,%3" [(set_attr "type" "vecperm")]) + + + +;; Operand numbers for the following peephole2 +(define_constants + [(SFBOOL_TMP_GPR 0) ;; GPR temporary + (SFBOOL_TMP_VSX 1) ;; vector temporary + (SFBOOL_MFVSR_D 2) ;; move to gpr dest + (SFBOOL_MFVSR_A 3) ;; move to gpr src + (SFBOOL_BOOL_D 4) ;; and/ior/xor dest + (SFBOOL_BOOL_A1 5) ;; and/ior/xor arg1 + (SFBOOL_BOOL_A2 6) ;; and/ior/xor arg2 + (SFBOOL_SHL_D 7) ;; shift left dest + (SFBOOL_SHL_A 8) ;; shift left arg + (SFBOOL_MTVSR_D 9) ;; move to vector dest + (SFBOOL_BOOL_A_DI 10) ;; SFBOOL_BOOL_A1/A2 as DImode + (SFBOOL_TMP_VSX_DI 11) ;; SFBOOL_TMP_VSX as DImode + (SFBOOL_MTVSR_D_V4SF 12)]) ;; SFBOOL_MTVSR_D as V4SFmode + +;; Attempt to optimize some common GLIBC operations 
using logical operations to +;; pick apart SFmode operations. For example, there is code from e_powf.c +;; after macro expansion that looks like: +;; +;; typedef union { +;; float value; +;; uint32_t word; +;; } ieee_float_shape_type; +;; +;; float t1; +;; int32_t is; +;; +;; do { +;; ieee_float_shape_type gf_u; +;; gf_u.value = (t1); +;; (is) = gf_u.word; +;; } while (0); +;; +;; do { +;; ieee_float_shape_type sf_u; +;; sf_u.word = (is & 0xfffff000); +;; (t1) = sf_u.value; +;; } while (0); +;; +;; +;; This would result in two direct move operations (convert to memory format, +;; direct move to GPR, do the AND operation, direct move to VSX, convert to +;; scalar format). With this peephole, we eliminate the direct move to the +;; GPR, and instead move the integer mask value to the vector register after a +;; shift and do the VSX logical operation. + +;; The insns for dealing with SFmode in GPR registers looks like: +;; (set (reg:V4SF reg2) (unspec:V4SF [(reg:SF reg1)] UNSPEC_VSX_CVDPSPN)) +;; +;; (set (reg:DI reg3) (unspec:DI [(reg:V4SF reg2)] UNSPEC_P8V_RELOAD_FROM_VSX)) +;; +;; (set (reg:DI reg3) (lshiftrt:DI (reg:DI reg3) (const_int 32))) +;; +;; (set (reg:DI reg5) (and:DI (reg:DI reg3) (reg:DI reg4))) +;; +;; (set (reg:DI reg6) (ashift:DI (reg:DI reg5) (const_int 32))) +;; +;; (set (reg:SF reg7) (unspec:SF [(reg:DI reg6)] UNSPEC_P8V_MTVSRD)) +;; +;; (set (reg:SF reg7) (unspec:SF [(reg:SF reg7)] UNSPEC_VSX_CVSPDPN)) + +(define_peephole2 + [(match_scratch:DI SFBOOL_TMP_GPR "r") + (match_scratch:V4SF SFBOOL_TMP_VSX "wa") + + ;; MFVSRD + (set (match_operand:DI SFBOOL_MFVSR_D "int_reg_operand") + (unspec:DI [(match_operand:V4SF SFBOOL_MFVSR_A "vsx_register_operand")] + UNSPEC_P8V_RELOAD_FROM_VSX)) + + ;; SRDI + (set (match_dup SFBOOL_MFVSR_D) + (lshiftrt:DI (match_dup SFBOOL_MFVSR_D) + (const_int 32))) + + ;; AND/IOR/XOR operation on int + (set (match_operand:SI SFBOOL_BOOL_D "int_reg_operand") + (and_ior_xor:SI (match_operand:SI SFBOOL_BOOL_A1 "int_reg_operand") + 
(match_operand:SI SFBOOL_BOOL_A2 "reg_or_cint_operand"))) + + ;; SLDI + (set (match_operand:DI SFBOOL_SHL_D "int_reg_operand") + (ashift:DI (match_operand:DI SFBOOL_SHL_A "int_reg_operand") + (const_int 32))) + + ;; MTVSRD + (set (match_operand:SF SFBOOL_MTVSR_D "vsx_register_operand") + (unspec:SF [(match_dup SFBOOL_SHL_D)] UNSPEC_P8V_MTVSRD))] + + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE + /* The REG_P (xxx) tests prevents SUBREG's, which allows us to use REGNO + to compare registers, when the mode is different. */ + && REG_P (operands[SFBOOL_MFVSR_D]) && REG_P (operands[SFBOOL_BOOL_D]) + && REG_P (operands[SFBOOL_BOOL_A1]) && REG_P (operands[SFBOOL_SHL_D]) + && REG_P (operands[SFBOOL_SHL_A]) && REG_P (operands[SFBOOL_MTVSR_D]) + && (REG_P (operands[SFBOOL_BOOL_A2]) + || CONST_INT_P (operands[SFBOOL_BOOL_A2])) + && (REGNO (operands[SFBOOL_BOOL_D]) == REGNO (operands[SFBOOL_MFVSR_D]) + || peep2_reg_dead_p (3, operands[SFBOOL_MFVSR_D])) + && (REGNO (operands[SFBOOL_MFVSR_D]) == REGNO (operands[SFBOOL_BOOL_A1]) + || (REG_P (operands[SFBOOL_BOOL_A2]) + && REGNO (operands[SFBOOL_MFVSR_D]) + == REGNO (operands[SFBOOL_BOOL_A2]))) + && REGNO (operands[SFBOOL_BOOL_D]) == REGNO (operands[SFBOOL_SHL_A]) + && (REGNO (operands[SFBOOL_SHL_D]) == REGNO (operands[SFBOOL_BOOL_D]) + || peep2_reg_dead_p (4, operands[SFBOOL_BOOL_D])) + && peep2_reg_dead_p (5, operands[SFBOOL_SHL_D])" + [(set (match_dup SFBOOL_TMP_GPR) + (ashift:DI (match_dup SFBOOL_BOOL_A_DI) + (const_int 32))) + + (set (match_dup SFBOOL_TMP_VSX_DI) + (match_dup SFBOOL_TMP_GPR)) + + (set (match_dup SFBOOL_MTVSR_D_V4SF) + (and_ior_xor:V4SF (match_dup SFBOOL_MFVSR_A) + (match_dup SFBOOL_TMP_VSX)))] +{ + rtx bool_a1 = operands[SFBOOL_BOOL_A1]; + rtx bool_a2 = operands[SFBOOL_BOOL_A2]; + int regno_mfvsr_d = REGNO (operands[SFBOOL_MFVSR_D]); + int regno_tmp_vsx = REGNO (operands[SFBOOL_TMP_VSX]); + int regno_mtvsr_d = REGNO (operands[SFBOOL_MTVSR_D]); + + if (CONST_INT_P (bool_a2)) + { + rtx tmp_gpr = 
operands[SFBOOL_TMP_GPR]; + emit_move_insn (tmp_gpr, bool_a2); + operands[SFBOOL_BOOL_A_DI] = tmp_gpr; + } + else + { + int regno_bool_a1 = REGNO (bool_a1); + int regno_bool_a2 = REGNO (bool_a2); + int regno_bool_a = (regno_mfvsr_d == regno_bool_a1 + ? regno_bool_a2 : regno_bool_a1); + operands[SFBOOL_BOOL_A_DI] = gen_rtx_REG (DImode, regno_bool_a); + } + + operands[SFBOOL_TMP_VSX_DI] = gen_rtx_REG (DImode, regno_tmp_vsx); + operands[SFBOOL_MTVSR_D_V4SF] = gen_rtx_REG (V4SFmode, regno_mtvsr_d); +}) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 9b44a4a5843..90a5c1099b9 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,13 @@ +2016-12-29 Michael Meissner + + PR target/71977 + PR target/70568 + PR target/78823 + * gcc.target/powerpc/pr71977-1.c: New tests to check whether on + 64-bit VSX systems with direct move, whether we optimize common + code sequences in the GLIBC math library for float math functions. + * gcc.target/powerpc/pr71977-2.c: Likewise. 
+ 2017-01-04 Marek Polacek PR c++/64767 diff --git a/gcc/testsuite/gcc.target/powerpc/pr71977-1.c b/gcc/testsuite/gcc.target/powerpc/pr71977-1.c new file mode 100644 index 00000000000..c4413b8747a --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr71977-1.c @@ -0,0 +1,31 @@ +/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O2" } */ + +#include + +typedef union +{ + float value; + uint32_t word; +} ieee_float_shape_type; + +float +mask_and_float_var (float f, uint32_t mask) +{ + ieee_float_shape_type u; + + u.value = f; + u.word &= mask; + + return u.value; +} + +/* { dg-final { scan-assembler "\[ \t\]xxland " } } */ +/* { dg-final { scan-assembler-not "\[ \t\]and " } } */ +/* { dg-final { scan-assembler-not "\[ \t\]mfvsrd " } } */ +/* { dg-final { scan-assembler-not "\[ \t\]stxv" } } */ +/* { dg-final { scan-assembler-not "\[ \t\]lxv" } } */ +/* { dg-final { scan-assembler-not "\[ \t\]srdi " } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr71977-2.c b/gcc/testsuite/gcc.target/powerpc/pr71977-2.c new file mode 100644 index 00000000000..8ec1b6126ad --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr71977-2.c @@ -0,0 +1,31 @@ +/* { dg-do compile { target { powerpc*-*-* && lp64 } } } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-require-effective-target powerpc_p8vector_ok } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O2" } */ + +#include + +typedef union +{ + float value; + uint32_t word; +} ieee_float_shape_type; + +float +mask_and_float_sign (float f) +{ + ieee_float_shape_type u; + + u.value = f; + u.word &= 0x80000000; + + return u.value; +} + +/* { dg-final { scan-assembler "\[ 
\t\]xxland " } } */ +/* { dg-final { scan-assembler-not "\[ \t\]and " } } */ +/* { dg-final { scan-assembler-not "\[ \t\]mfvsrd " } } */ +/* { dg-final { scan-assembler-not "\[ \t\]stxv" } } */ +/* { dg-final { scan-assembler-not "\[ \t\]lxv" } } */ +/* { dg-final { scan-assembler-not "\[ \t\]srdi " } } */