From: Michael Meissner Date: Tue, 26 Apr 2011 17:48:29 +0000 (+0000) Subject: PR target/48258, improve vector reduction on power7 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=df10b6d444785aaf24fde0f692dfff7a7aa8ce3f;p=gcc.git PR target/48258, improve vector reduction on power7 From-SVN: r172981 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 0749c64c384..99c029d748d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,27 @@ +2011-04-26 Michael Meissner + + PR target/48258 + * config/rs6000/vector.md (UNSPEC_REDUC): New unspec for vector + reduction. + (VEC_reduc): New code iterator and splitters for vector reduction. + (VEC_reduc_name): Ditto. + (VEC_reduc_rtx): Ditto. + (reduc_<VEC_reduc_name>_v2df): Vector reduction expanders for VSX. + (reduc_<VEC_reduc_name>_v4sf): Ditto. + + * config/rs6000/rs6000.c (rs6000_expand_vector_extract): Add + support for extracting SF on VSX. + + * config/rs6000/vsx.md (vsx_xscvspdp_scalar2): New insn for + generating xscvspdp. + (vsx_extract_v4sf): New insn to extract SF from V4SF vector. + (vsx_reduc_<VEC_reduc_name>_v2df): New insns and splitters for + double add, minimum, maximum vector reduction. + (vsx_reduc_<VEC_reduc_name>_v4sf): Ditto. + (vsx_reduc_<VEC_reduc_name>_v2df2_scalar): New combiner insn to + optimize double vector reduction. + (vsx_reduc_<VEC_reduc_name>_v4sf_scalar): Ditto. + 2011-04-26 Joseph Myers * config/fr30/fr30.h (inhibit_libc): Don't define. diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 6113a75b390..41259630bf4 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5463,12 +5463,22 @@ rs6000_expand_vector_extract (rtx target, rtx vec, int elt) enum machine_mode inner_mode = GET_MODE_INNER (mode); rtx mem; - if (VECTOR_MEM_VSX_P (mode) && (mode == V2DFmode || mode == V2DImode)) + if (VECTOR_MEM_VSX_P (mode)) { - rtx (*extract_func) (rtx, rtx, rtx) - = ((mode == V2DFmode) ? 
gen_vsx_extract_v2df : gen_vsx_extract_v2di); - emit_insn (extract_func (target, vec, GEN_INT (elt))); - return; + switch (mode) + { + default: + break; + case V2DFmode: + emit_insn (gen_vsx_extract_v2df (target, vec, GEN_INT (elt))); + return; + case V2DImode: + emit_insn (gen_vsx_extract_v2di (target, vec, GEN_INT (elt))); + return; + case V4SFmode: + emit_insn (gen_vsx_extract_v4sf (target, vec, GEN_INT (elt))); + return; + } } /* Allocate mode-sized buffer. */ diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index a3a8e124dd3..c0112507a6e 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -74,7 +74,19 @@ (V2DF "V2DI")]) ;; constants for unspec -(define_c_enum "unspec" [UNSPEC_PREDICATE]) +(define_c_enum "unspec" [UNSPEC_PREDICATE + UNSPEC_REDUC]) + +;; Vector reduction code iterators +(define_code_iterator VEC_reduc [plus smin smax]) + +(define_code_attr VEC_reduc_name [(plus "splus") + (smin "smin") + (smax "smax")]) + +(define_code_attr VEC_reduc_rtx [(plus "add") + (smin "smin") + (smax "smax")]) ;; Vector move instructions. @@ -991,6 +1003,41 @@ "TARGET_ALTIVEC" "") +;; Vector reduction expanders for VSX + +(define_expand "reduc_<VEC_reduc_name>_v2df" + [(parallel [(set (match_operand:V2DF 0 "vfloat_operand" "") + (VEC_reduc:V2DF + (vec_concat:V2DF + (vec_select:DF + (match_operand:V2DF 1 "vfloat_operand" "") + (parallel [(const_int 1)])) + (vec_select:DF + (match_dup 1) + (parallel [(const_int 0)]))) + (match_dup 1))) + (clobber (match_scratch:V2DF 2 ""))])] + "VECTOR_UNIT_VSX_P (V2DFmode)" + "") + +; The (VEC_reduc:V4SF +; (op1) +; (unspec:V4SF [(const_int 0)] UNSPEC_REDUC)) +; +; is to allow us to use a code iterator, but not completely list all of the +; vector rotates, etc. 
to prevent canonicalization + +(define_expand "reduc_<VEC_reduc_name>_v4sf" + [(parallel [(set (match_operand:V4SF 0 "vfloat_operand" "") + (VEC_reduc:V4SF + (unspec:V4SF [(const_int 0)] UNSPEC_REDUC) + (match_operand:V4SF 1 "vfloat_operand" ""))) + (clobber (match_scratch:V4SF 2 "")) + (clobber (match_scratch:V4SF 3 ""))])] + "VECTOR_UNIT_VSX_P (V4SFmode)" + "") + + ;;; Expanders for vector insn patterns shared between the SPE and TARGET_PAIRED systems. (define_expand "absv2sf2" diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index fc331dc27ed..d4f529676e4 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -829,6 +829,15 @@ "xscvdpsp %x0,%x1" [(set_attr "type" "fp")]) +;; Same as vsx_xscvspdp, but use SF as the type +(define_insn "vsx_xscvspdp_scalar2" + [(set (match_operand:SF 0 "vsx_register_operand" "=f") + (unspec:SF [(match_operand:V4SF 1 "vsx_register_operand" "wa")] + UNSPEC_VSX_CVSPDP))] + "VECTOR_UNIT_VSX_P (DFmode)" + "xscvspdp %x0,%x1" + [(set_attr "type" "fp")]) + ;; Convert from 64-bit to 32-bit types ;; Note, favor the Altivec registers since the usual use of these instructions ;; is in vector converts and we need to use the Altivec vperm instruction. 
@@ -1039,6 +1048,43 @@ [(set_attr "type" "fpload") (set_attr "length" "4")]) +;; Extract a SF element from V4SF +(define_insn_and_split "vsx_extract_v4sf" + [(set (match_operand:SF 0 "vsx_register_operand" "=f,f") + (vec_select:SF + (match_operand:V4SF 1 "vsx_register_operand" "wa,wa") + (parallel [(match_operand:QI 2 "u5bit_cint_operand" "O,i")]))) + (clobber (match_scratch:V4SF 3 "=X,0"))] + "VECTOR_UNIT_VSX_P (V4SFmode)" + "@ + xscvspdp %x0,%x1 + #" + "" + [(const_int 0)] + " +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx op2 = operands[2]; + rtx op3 = operands[3]; + rtx tmp; + HOST_WIDE_INT ele = INTVAL (op2); + + if (ele == 0) + tmp = op1; + else + { + if (GET_CODE (op3) == SCRATCH) + op3 = gen_reg_rtx (V4SFmode); + emit_insn (gen_vsx_xxsldwi_v4sf (op3, op1, op1, op2)); + tmp = op3; + } + emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp)); + DONE; +}" + [(set_attr "length" "4,8") + (set_attr "type" "fp")]) + ;; General double word oriented permute, allow the other vector types for ;; optimizing the permute instruction. (define_insn "vsx_xxpermdi_<mode>" @@ -1150,3 +1196,153 @@ "VECTOR_MEM_VSX_P (<MODE>mode)" "xxsldwi %x0,%x1,%x2,%3" [(set_attr "type" "vecperm")]) + + +;; Vector reduction insns and splitters + +(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df" + [(set (match_operand:V2DF 0 "vfloat_operand" "=&wd,&?wa,wd,?wa") + (VEC_reduc:V2DF + (vec_concat:V2DF + (vec_select:DF + (match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa") + (parallel [(const_int 1)])) + (vec_select:DF + (match_dup 1) + (parallel [(const_int 0)]))) + (match_dup 1))) + (clobber (match_scratch:V2DF 2 "=0,0,&wd,&wa"))] + "VECTOR_UNIT_VSX_P (V2DFmode)" + "#" + "" + [(const_int 0)] + " +{ + rtx tmp = (GET_CODE (operands[2]) == SCRATCH) + ? 
gen_reg_rtx (V2DFmode) + : operands[2]; + emit_insn (gen_vsx_xxsldwi_v2df (tmp, operands[1], operands[1], const2_rtx)); + emit_insn (gen_<VEC_reduc_rtx>v2df3 (operands[0], tmp, operands[1])); + DONE; +}" + [(set_attr "length" "8") + (set_attr "type" "veccomplex")]) + +(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf" + [(set (match_operand:V4SF 0 "vfloat_operand" "=wf,?wa") + (VEC_reduc:V4SF + (unspec:V4SF [(const_int 0)] UNSPEC_REDUC) + (match_operand:V4SF 1 "vfloat_operand" "wf,wa"))) + (clobber (match_scratch:V4SF 2 "=&wf,&wa")) + (clobber (match_scratch:V4SF 3 "=&wf,&wa"))] + "VECTOR_UNIT_VSX_P (V4SFmode)" + "#" + "" + [(const_int 0)] + " +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx tmp2, tmp3, tmp4; + + if (can_create_pseudo_p ()) + { + tmp2 = gen_reg_rtx (V4SFmode); + tmp3 = gen_reg_rtx (V4SFmode); + tmp4 = gen_reg_rtx (V4SFmode); + } + else + { + tmp2 = operands[2]; + tmp3 = operands[3]; + tmp4 = tmp2; + } + + emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx)); + emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1)); + emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3))); + emit_insn (gen_<VEC_reduc_rtx>v4sf3 (op0, tmp4, tmp3)); + DONE; +}" + [(set_attr "length" "16") + (set_attr "type" "veccomplex")]) + +;; Combiner patterns with the vector reduction patterns that knows we can get +;; to the top element of the V2DF array without doing an extract. + +(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v2df_scalar" + [(set (match_operand:DF 0 "vfloat_operand" "=&ws,&?wa,ws,?wa") + (vec_select:DF + (VEC_reduc:V2DF + (vec_concat:V2DF + (vec_select:DF + (match_operand:V2DF 1 "vfloat_operand" "wd,wa,wd,wa") + (parallel [(const_int 1)])) + (vec_select:DF + (match_dup 1) + (parallel [(const_int 0)]))) + (match_dup 1)) + (parallel [(const_int 1)]))) + (clobber (match_scratch:DF 2 "=0,0,&wd,&wa"))] + "VECTOR_UNIT_VSX_P (V2DFmode)" + "#" + "" + [(const_int 0)] + " +{ + rtx hi = gen_highpart (DFmode, operands[1]); + rtx lo = (GET_CODE (operands[2]) == SCRATCH) + ? 
gen_reg_rtx (DFmode) + : operands[2]; + + emit_insn (gen_vsx_extract_v2df (lo, operands[1], const1_rtx)); + emit_insn (gen_<VEC_reduc_rtx>df3 (operands[0], hi, lo)); + DONE; +}" + [(set_attr "length" "8") + (set_attr "type" "veccomplex")]) + +(define_insn_and_split "*vsx_reduc_<VEC_reduc_name>_v4sf_scalar" + [(set (match_operand:SF 0 "vfloat_operand" "=f,?f") + (vec_select:SF + (VEC_reduc:V4SF + (unspec:V4SF [(const_int 0)] UNSPEC_REDUC) + (match_operand:V4SF 1 "vfloat_operand" "wf,wa")) + (parallel [(const_int 3)]))) + (clobber (match_scratch:V4SF 2 "=&wf,&wa")) + (clobber (match_scratch:V4SF 3 "=&wf,&wa")) + (clobber (match_scratch:V4SF 4 "=0,0"))] + "VECTOR_UNIT_VSX_P (V4SFmode)" + "#" + "" + [(const_int 0)] + " +{ + rtx op0 = operands[0]; + rtx op1 = operands[1]; + rtx tmp2, tmp3, tmp4, tmp5; + + if (can_create_pseudo_p ()) + { + tmp2 = gen_reg_rtx (V4SFmode); + tmp3 = gen_reg_rtx (V4SFmode); + tmp4 = gen_reg_rtx (V4SFmode); + tmp5 = gen_reg_rtx (V4SFmode); + } + else + { + tmp2 = operands[2]; + tmp3 = operands[3]; + tmp4 = tmp2; + tmp5 = operands[4]; + } + + emit_insn (gen_vsx_xxsldwi_v4sf (tmp2, op1, op1, const2_rtx)); + emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp3, tmp2, op1)); + emit_insn (gen_vsx_xxsldwi_v4sf (tmp4, tmp3, tmp3, GEN_INT (3))); + emit_insn (gen_<VEC_reduc_rtx>v4sf3 (tmp5, tmp4, tmp3)); + emit_insn (gen_vsx_xscvspdp_scalar2 (op0, tmp5)); + DONE; +}" + [(set_attr "length" "20") + (set_attr "type" "veccomplex")]) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 79cb509fb08..f6533cae2bb 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,11 @@ +2011-03-23 Michael Meissner + + PR target/48258 + * gcc.target/powerpc/pr48258-1.c: New file. + * gcc.target/powerpc/pr48258-2.c: Ditto. + 2011-04-26 Xinliang David Li - + * gcc.dg/uninit-suppress.c: New test. * gcc.dg/uninit-suppress.c: New test. 
diff --git a/gcc/testsuite/gcc.target/powerpc/pr48258-1.c b/gcc/testsuite/gcc.target/powerpc/pr48258-1.c new file mode 100644 index 00000000000..4f37815d384 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr48258-1.c @@ -0,0 +1,57 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-require-effective-target powerpc_vsx_ok } */ +/* { dg-options "-O3 -mcpu=power7 -mabi=altivec -ffast-math -fno-unroll-loops" } */ +/* { dg-final { scan-assembler-times "xvaddsp" 3 } } */ +/* { dg-final { scan-assembler-times "xvminsp" 3 } } */ +/* { dg-final { scan-assembler-times "xvmaxsp" 3 } } */ +/* { dg-final { scan-assembler-times "xxsldwi" 6 } } */ +/* { dg-final { scan-assembler-times "xscvspdp" 3 } } */ +/* { dg-final { scan-assembler-not "stvewx" } } */ +/* { dg-final { scan-assembler-not "stvx" } } */ +/* { dg-final { scan-assembler-not "stxvd2x" } } */ +/* { dg-final { scan-assembler-not "stxvw4x" } } */ + +#include <stddef.h> + +#ifndef SIZE +#define SIZE 1024 +#endif + +float values[SIZE] __attribute__((__aligned__(32))); + +float +vector_sum (void) +{ + size_t i; + float sum = 0.0f; + + for (i = 0; i < SIZE; i++) + sum += values[i]; + + return sum; +} + +float +vector_min (void) +{ + size_t i; + float min = values[0]; + + for (i = 0; i < SIZE; i++) + min = __builtin_fminf (min, values[i]); + + return min; +} + +float +vector_max (void) +{ + size_t i; + float max = values[0]; + + for (i = 0; i < SIZE; i++) + max = __builtin_fmaxf (max, values[i]); + + return max; +} diff --git a/gcc/testsuite/gcc.target/powerpc/pr48258-2.c b/gcc/testsuite/gcc.target/powerpc/pr48258-2.c new file mode 100644 index 00000000000..443fb624e30 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr48258-2.c @@ -0,0 +1,58 @@ +/* { dg-do compile } */ +/* { dg-skip-if "" { powerpc*-*-darwin* } { "*" } { "" } } */ +/* { dg-require-effective-target powerpc_vsx_ok } */ +/* { dg-options "-O3 -mcpu=power7 -mabi=altivec -ffast-math -fno-unroll-loops" } */ +/* { 
dg-final { scan-assembler-times "xvadddp" 1 } } */ +/* { dg-final { scan-assembler-times "xvmindp" 1 } } */ +/* { dg-final { scan-assembler-times "xvmaxdp" 1 } } */ +/* { dg-final { scan-assembler-times "xsadddp" 1 } } */ +/* { dg-final { scan-assembler-times "xsmindp" 1 } } */ +/* { dg-final { scan-assembler-times "xsmaxdp" 1 } } */ +/* { dg-final { scan-assembler-not "xxsldwi" } } */ +/* { dg-final { scan-assembler-not "stvx" } } */ +/* { dg-final { scan-assembler-not "stxvd2x" } } */ +/* { dg-final { scan-assembler-not "stxvw4x" } } */ + +#include <stddef.h> + +#ifndef SIZE +#define SIZE 1024 +#endif + +double values[SIZE] __attribute__((__aligned__(32))); + +double +vector_sum (void) +{ + size_t i; + double sum = 0.0; + + for (i = 0; i < SIZE; i++) + sum += values[i]; + + return sum; +} + +double +vector_min (void) +{ + size_t i; + double min = values[0]; + + for (i = 0; i < SIZE; i++) + min = __builtin_fmin (min, values[i]); + + return min; +} + +double +vector_max (void) +{ + size_t i; + double max = values[0]; + + for (i = 0; i < SIZE; i++) + max = __builtin_fmax (max, values[i]); + + return max; +}