PR target/48258, improve vector reduction on power7
authorMichael Meissner <meissner@linux.vnet.ibm.com>
Tue, 26 Apr 2011 17:48:29 +0000 (17:48 +0000)
committerMichael Meissner <meissner@gcc.gnu.org>
Tue, 26 Apr 2011 17:48:29 +0000 (17:48 +0000)
From-SVN: r172981

gcc/ChangeLog
gcc/config/rs6000/rs6000.c
gcc/config/rs6000/vector.md
gcc/config/rs6000/vsx.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/powerpc/pr48258-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/powerpc/pr48258-2.c [new file with mode: 0644]

index 0749c64c384b3865906c7e6e6db50d864e6a59ad..99c029d748d6472285aeefe3cc22e620c51ed72b 100644 (file)
@@ -1,3 +1,27 @@
+2011-04-26  Michael Meissner  <meissner@linux.vnet.ibm.com>
+
+       PR target/48258
+       * config/rs6000/vector.md (UNSPEC_REDUC): New unspec for vector
+       reduction.
+       (VEC_reduc): New code iterator for vector reduction.
+       (VEC_reduc_name): New code attribute for vector reduction.
+       (VEC_reduc_rtx): Ditto.
+       (reduc_<VEC_reduc_name>_v2df): Vector reduction expanders for VSX.
+       (reduc_<VEC_reduc_name>_v4sf): Ditto.
+
+       * config/rs6000/rs6000.c (rs6000_expand_vector_extract): Add
+       support for extracting SF on VSX.
+
+       * config/rs6000/vsx.md (vsx_xscvspdp_scalar2): New insn for
+       generating xscvspdp.
+       (vsx_extract_v4sf): New insn to extract SF from V4SF vector.
+       (vsx_reduc_<VEC_reduc_name>_v2df): New insns and splitters for
+       double add, minimum, maximum vector reduction.
+       (vsx_reduc_<VEC_reduc_name>_v4sf): Ditto.
+       (vsx_reduc_<VEC_reduc_name>_v2df_scalar): New combiner insn to
+       optimize double vector reduction.
+       (vsx_reduc_<VEC_reduc_name>_v4sf_scalar): Ditto.
+
 2011-04-26  Joseph Myers  <joseph@codesourcery.com>
 
        * config/fr30/fr30.h (inhibit_libc): Don't define.
index 6113a75b390abaadb3bba858e1ccf739fb88577f..41259630bf4f5d04af0f1f5455005df1d3c9de11 100644 (file)
@@ -5463,12 +5463,22 @@ rs6000_expand_vector_extract (rtx target, rtx vec, int elt)
   enum machine_mode inner_mode = GET_MODE_INNER (mode);
   rtx mem;
 
-  if (VECTOR_MEM_VSX_P (mode) && (mode == V2DFmode || mode == V2DImode))
+  if (VECTOR_MEM_VSX_P (mode))
     {
-      rtx (*extract_func) (rtx, rtx, rtx)
-       = ((mode == V2DFmode) ? gen_vsx_extract_v2df : gen_vsx_extract_v2di);
-      emit_insn (extract_func (target, vec, GEN_INT (elt)));
-      return;
+      switch (mode)
+       {
+       default:
+         break;
+       case V2DFmode:
+         emit_insn (gen_vsx_extract_v2df (target, vec, GEN_INT (elt)));
+         return;
+       case V2DImode:
+         emit_insn (gen_vsx_extract_v2di (target, vec, GEN_INT (elt)));
+         return;
+       case V4SFmode:
+         emit_insn (gen_vsx_extract_v4sf (target, vec, GEN_INT (elt)));
+         return;
+       }
     }
 
   /* Allocate mode-sized buffer.  */
index a3a8e124dd37e7ce2af6f777b8694cc8dff0ca09..c0112507a6e49723416a46021b8c239c050fc5f7 100644 (file)
                           (V2DF  "V2DI")])
 
 ;; constants for unspec
-(define_c_enum "unspec" [UNSPEC_PREDICATE])
+(define_c_enum "unspec" [UNSPEC_PREDICATE
+                        UNSPEC_REDUC])
+
+;; Vector reduction code iterators
+(define_code_iterator VEC_reduc [plus smin smax])
+
+(define_code_attr VEC_reduc_name [(plus "splus")
+                                 (smin "smin")
+                                 (smax "smax")])
+
+(define_code_attr VEC_reduc_rtx [(plus "add")
+                                (smin "smin")
+                                (smax "smax")])
 
 \f
 ;; Vector move instructions.
   "TARGET_ALTIVEC"
   "")
 \f
+;; Vector reduction expanders for VSX
+
+(define_expand "reduc_<VEC_reduc_name>_v2df"
+  [(parallel [(set (match_operand:V2DF 0 "vfloat_operand" "")
+                  (VEC_reduc:V2DF
+                   (vec_concat:V2DF
+                    (vec_select:DF
+                     (match_operand:V2DF 1 "vfloat_operand" "")
+                     (parallel [(const_int 1)]))
+                    (vec_select:DF
+                     (match_dup 1)
+                     (parallel [(const_int 0)])))
+                   (match_dup 1)))
+             (clobber (match_scratch:V2DF 2 ""))])]
+  "VECTOR_UNIT_VSX_P (V2DFmode)"
+  "")
+
+; The form
+;      (VEC_reduc:V4SF
+;       (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+;       (op1))
+;
+; lets us use a code iterator without spelling out all of the vector
+; rotates, etc. in the pattern, and prevents canonicalization.
+
+(define_expand "reduc_<VEC_reduc_name>_v4sf"
+  [(parallel [(set (match_operand:V4SF 0 "vfloat_operand" "")
+                  (VEC_reduc:V4SF
+                   (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+                   (match_operand:V4SF 1 "vfloat_operand" "")))
+             (clobber (match_scratch:V4SF 2 ""))
+             (clobber (match_scratch:V4SF 3 ""))])]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "")
+
+\f
 ;;; Expanders for vector insn patterns shared between the SPE and TARGET_PAIRED systems.
 
 (define_expand "absv2sf2"
index fc331dc27edcbc99a181b63f5a75cbe4fdbdead9..d4f529676e42b000007e0001e4d659f775a2ab5d 100644 (file)
   "xscvdpsp %x0,%x1"
   [(set_attr "type" "fp")])
 
+;; Same as vsx_xscvspdp, but use SF as the type
+(define_insn "vsx_xscvspdp_scalar2"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=f")
+       (unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")]
+                  UNSPEC_VSX_CVSPDP))]
+  "VECTOR_UNIT_VSX_P (DFmode)"
+  "xscvspdp %x0,%x1"
+  [(set_attr "type" "fp")])
+
 ;; Convert from 64-bit to 32-bit types
 ;; Note, favor the Altivec registers since the usual use of these instructions
 ;; is in vector converts and we need to use the Altivec vperm instruction.
   [(set_attr "type" "fpload")
    (set_attr "length" "4")])  
 
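+;; Extract a SF element from V4SF.  Element 0 is converted directly with
+;; xscvspdp; for any other element the split first uses xxsldwi to shift the
+;; vector by the element number and then converts the result.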
+(define_insn_and_split "vsx_extract_v4sf"
+  [(set (match_operand:SF 0 "vsx_register_operand" "=f,f")
+       (vec_select:SF
+        (match_operand:V4SF 1 "vsx_register_operand" "wa,wa")
+        (parallel [(match_operand:QI 2 "u5bit_cint_operand" "O,i")])))
+   (clobber (match_scratch:V4SF 3 "=X,0"))]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "@
+   xscvspdp %x0,%x1
+   #"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx op2 = operands[2];
+  rtx op3 = operands[3];
+  rtx tmp;
+  HOST_WIDE_INT ele = INTVAL (op2);
+
+  if (ele == 0)
+    tmp = op1;
+  else
+    {
+      if (GET_CODE (op3) == SCRATCH)
+       op3 = gen_reg_rtx (V4SFmode);
+      emit_insn (gen_vsx_xxsldwi_v4sf (op3, op1, op1, op2));
+      tmp = op3;
+    }
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp));
+  DONE;
+}"
+  [(set_attr "length" "4,8")
+   (set_attr "type" "fp")])
+
 ;; General double word oriented permute, allow the other vector types for
 ;; optimizing the permute instruction.
 (define_insn "vsx_xxpermdi_<mode>"
   "VECTOR_MEM_VSX_P (<MODE>mode)"
   "xxsldwi %x0,%x1,%x2,%3"
   [(set_attr "type" "vecperm")])
+
+\f
+;; Vector reduction insns and splitters
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df"
+  [(set (match_operand:V2DF 0 "vfloat_operand" "=&wd,&?wa,wd,?wa")
+       (VEC_reduc:V2DF
+        (vec_concat:V2DF
+         (vec_select:DF
+          (match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa")
+          (parallel [(const_int 1)]))
+         (vec_select:DF
+          (match_dup 1)
+          (parallel [(const_int 0)])))
+        (match_dup 1)))
+   (clobber (match_scratch:V2DF 2 "=0,0,&wd,&wa"))]
+  "VECTOR_UNIT_VSX_P (V2DFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx tmp = (GET_CODE (operands[2]) == SCRATCH)
+            ? gen_reg_rtx (V2DFmode)
+            : operands[2];
+  emit_insn (gen_vsx_xxsldwi_v2df (tmp, operands[1], operands[1], const2_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>v2df3 (operands[0], tmp, operands[1]));
+  DONE;
+}"
+  [(set_attr "length" "8")
+   (set_attr "type" "veccomplex")])
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf"
+  [(set (match_operand:V4SF 0 "vfloat_operand" "=wf,?wa")
+       (VEC_reduc:V4SF
+        (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+        (match_operand:V4SF 1 "vfloat_operand" "wf,wa")))
+   (clobber (match_scratch:V4SF 2 "=&wf,&wa"))
+   (clobber (match_scratch:V4SF 3 "=&wf,&wa"))]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp2, tmp3, tmp4;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp2 = gen_reg_rtx (V4SFmode);
+      tmp3 = gen_reg_rtx (V4SFmode);
+      tmp4 = gen_reg_rtx (V4SFmode);
+    }
+  else
+    {
+      tmp2 = operands[2];
+      tmp3 = operands[3];
+      tmp4 = tmp2;
+    }
+
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (op0, tmp4, tmp3));
+  DONE;
+}"
+  [(set_attr "length" "16")
+   (set_attr "type" "veccomplex")])
+
+;; Combiner patterns for the vector reduction patterns that know we can get
+;; to the top element of the V2DF array without doing an extract.
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df_scalar"
+  [(set (match_operand:DF 0 "vfloat_operand" "=&ws,&?wa,ws,?wa")
+       (vec_select:DF
+        (VEC_reduc:V2DF
+         (vec_concat:V2DF
+          (vec_select:DF
+           (match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa")
+           (parallel [(const_int 1)]))
+          (vec_select:DF
+           (match_dup 1)
+           (parallel [(const_int 0)])))
+         (match_dup 1))
+        (parallel [(const_int 1)])))
+   (clobber (match_scratch:DF 2 "=0,0,&wd,&wa"))]
+  "VECTOR_UNIT_VSX_P (V2DFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx hi = gen_highpart (DFmode, operands[1]);
+  rtx lo = (GET_CODE (operands[2]) == SCRATCH)
+           ? gen_reg_rtx (DFmode)
+           : operands[2];
+
+  emit_insn (gen_vsx_extract_v2df (lo, operands[1], const1_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>df3 (operands[0], hi, lo));
+  DONE;
+}"
+  [(set_attr "length" "8")
+   (set_attr "type" "veccomplex")])
+
+(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf_scalar"
+  [(set (match_operand:SF 0 "vfloat_operand" "=f,?f")
+       (vec_select:SF
+        (VEC_reduc:V4SF
+         (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)
+         (match_operand:V4SF 1 "vfloat_operand" "wf,wa"))
+        (parallel [(const_int 3)])))
+   (clobber (match_scratch:V4SF 2 "=&wf,&wa"))
+   (clobber (match_scratch:V4SF 3 "=&wf,&wa"))
+   (clobber (match_scratch:V4SF 4 "=0,0"))]
+  "VECTOR_UNIT_VSX_P (V4SFmode)"
+  "#"
+  ""
+  [(const_int 0)]
+  "
+{
+  rtx op0 = operands[0];
+  rtx op1 = operands[1];
+  rtx tmp2, tmp3, tmp4, tmp5;
+
+  if (can_create_pseudo_p ())
+    {
+      tmp2 = gen_reg_rtx (V4SFmode);
+      tmp3 = gen_reg_rtx (V4SFmode);
+      tmp4 = gen_reg_rtx (V4SFmode);
+      tmp5 = gen_reg_rtx (V4SFmode);
+    }
+  else
+    {
+      tmp2 = operands[2];
+      tmp3 = operands[3];
+      tmp4 = tmp2;
+      tmp5 = operands[4];
+    }
+
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1));
+  emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3)));
+  emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp5, tmp4, tmp3));
+  emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp5));
+  DONE;
+}"
+  [(set_attr "length" "20")
+   (set_attr "type" "veccomplex")])
index 79cb509fb08b0317ba901f82679923027d9b3a97..f6533cae2bb13d11119fd7a0a2bf544468e80851 100644 (file)
@@ -1,5 +1,11 @@
+2011-04-26  Michael Meissner  <meissner@linux.vnet.ibm.com>
+
+       PR target/48258
+       * gcc.target/powerpc/pr48258-1.c: New file.
+       * gcc.target/powerpc/pr48258-2.c: Ditto.
+
 2011-04-26  Xinliang David Li  <davidxl@google.com>
-       
+
        * gcc.dg/uninit-suppress.c: New test.
        * gcc.dg/uninit-suppress.c: New test.
 
diff --git a/gcc/testsuite/gcc.target/powerpc/pr48258-1.c b/gcc/testsuite/gcc.target/powerpc/pr48258-1.c
new file mode 100644 (file)
index 0000000..4f37815
--- /dev/null
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O3 -mcpu=power7 -mabi=altivec -ffast-math -fno-unroll-loops" } */
+/* { dg-final { scan-assembler-times "xvaddsp" 3 } } */
+/* { dg-final { scan-assembler-times "xvminsp" 3 } } */
+/* { dg-final { scan-assembler-times "xvmaxsp" 3 } } */
+/* { dg-final { scan-assembler-times "xxsldwi" 6 } } */
+/* { dg-final { scan-assembler-times "xscvspdp" 3 } } */
+/* { dg-final { scan-assembler-not "stvewx" } } */
+/* { dg-final { scan-assembler-not "stvx" } } */
+/* { dg-final { scan-assembler-not "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "stxvw4x" } } */
+
+#include <stddef.h>
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+float values[SIZE] __attribute__((__aligned__(32)));
+
+float
+vector_sum (void)  /* Return the sum of all SIZE floats in values[]; the file's dg-final scans look for xvaddsp.  */
+{
+  size_t i;
+  float sum = 0.0f;  /* Running total of the add reduction.  */
+
+  for (i = 0; i < SIZE; i++)
+    sum += values[i];
+
+  return sum;
+}
+
+float
+vector_min (void)  /* Return the minimum of values[] via __builtin_fminf; the file's dg-final scans look for xvminsp.  */
+{
+  size_t i;
+  float min = values[0];  /* Seed the running minimum with the first element.  */
+
+  for (i = 0; i < SIZE; i++)  /* Starts at 0, so values[0] is also compared against the seed.  */
+    min = __builtin_fminf (min, values[i]);
+
+  return min;
+}
+
+float
+vector_max (void)  /* Return the maximum of values[] via __builtin_fmaxf; the file's dg-final scans look for xvmaxsp.  */
+{
+  size_t i;
+  float max = values[0];  /* Seed the running maximum with the first element.  */
+
+  for (i = 0; i < SIZE; i++)  /* Starts at 0, so values[0] is also compared against the seed.  */
+    max = __builtin_fmaxf (max, values[i]);
+
+  return max;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr48258-2.c b/gcc/testsuite/gcc.target/powerpc/pr48258-2.c
new file mode 100644 (file)
index 0000000..443fb62
--- /dev/null
@@ -0,0 +1,58 @@
+/* { dg-do compile } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O3 -mcpu=power7 -mabi=altivec -ffast-math -fno-unroll-loops" } */
+/* { dg-final { scan-assembler-times "xvadddp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmindp" 1 } } */
+/* { dg-final { scan-assembler-times "xvmaxdp" 1 } } */
+/* { dg-final { scan-assembler-times "xsadddp" 1 } } */
+/* { dg-final { scan-assembler-times "xsmindp" 1 } } */
+/* { dg-final { scan-assembler-times "xsmaxdp" 1 } } */
+/* { dg-final { scan-assembler-not "xxsldwi" } } */
+/* { dg-final { scan-assembler-not "stvx" } } */
+/* { dg-final { scan-assembler-not "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "stxvw4x" } } */
+
+#include <stddef.h>
+
+#ifndef SIZE
+#define SIZE 1024
+#endif
+
+double values[SIZE] __attribute__((__aligned__(32)));
+
+double
+vector_sum (void)  /* Return the sum of all SIZE doubles in values[]; the file's dg-final scans look for xvadddp and xsadddp.  */
+{
+  size_t i;
+  double sum = 0.0;  /* Running total of the add reduction.  */
+
+  for (i = 0; i < SIZE; i++)
+    sum += values[i];
+
+  return sum;
+}
+
+double
+vector_min (void)  /* Return the minimum of values[] via __builtin_fmin; the file's dg-final scans look for xvmindp and xsmindp.  */
+{
+  size_t i;
+  double min = values[0];  /* Seed the running minimum with the first element.  */
+
+  for (i = 0; i < SIZE; i++)  /* Starts at 0, so values[0] is also compared against the seed.  */
+    min = __builtin_fmin (min, values[i]);
+
+  return min;
+}
+
+double
+vector_max (void)  /* Return the maximum of values[] via __builtin_fmax; the file's dg-final scans look for xvmaxdp and xsmaxdp.  */
+{
+  size_t i;
+  double max = values[0];  /* Seed the running maximum with the first element.  */
+
+  for (i = 0; i < SIZE; i++)  /* Starts at 0, so values[0] is also compared against the seed.  */
+    max = __builtin_fmax (max, values[i]);
+
+  return max;
+}