From 55b2ce1c919ae1a5460f8bfd105f64155853d701 Mon Sep 17 00:00:00 2001 From: Bill Schmidt Date: Tue, 5 Sep 2017 19:41:55 +0000 Subject: [PATCH] re PR target/81833 (PowerPC: VSX: Miscompiles ffmpeg's scalarproduct_int16_vsx at -O1) [gcc] 2017-09-05 Bill Schmidt PR target/81833 * config/rs6000/altivec.md (altivec_vsum2sws): Convert from a define_insn to a define_expand. (altivec_vsum2sws_direct): New define_insn. (altivec_vsumsws): Convert from a define_insn to a define_expand. [gcc/testsuite] 2017-09-05 Bill Schmidt PR target/81833 * gcc.target/powerpc/pr81833-1.c: New file. * gcc.target/powerpc/pr81833-2.c: New file. From-SVN: r251723 --- gcc/ChangeLog | 8 ++ gcc/config/rs6000/altivec.md | 78 +++++++++++--------- gcc/testsuite/ChangeLog | 6 ++ gcc/testsuite/gcc.target/powerpc/pr81833-1.c | 59 +++++++++++++++ gcc/testsuite/gcc.target/powerpc/pr81833-2.c | 59 +++++++++++++++ 5 files changed, 176 insertions(+), 34 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr81833-1.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr81833-2.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 99a3a768804..158b5248433 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2017-09-05 Bill Schmidt + + PR target/81833 + * config/rs6000/altivec.md (altivec_vsum2sws): Convert from a + define_insn to a define_expand. + (altivec_vsum2sws_direct): New define_insn. + (altivec_vsumsws): Convert from a define_insn to a define_expand. + 2017-09-05 Wilco Dijkstra * config/arm/arm.c (arm_option_params_internal): Improve setting of diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index c9dddda1b4c..0aa1e301690 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -1804,51 +1804,61 @@ "vsum4ss %0,%1,%2" [(set_attr "type" "veccomplex")]) -;; FIXME: For the following two patterns, the scratch should only be -;; allocated for !VECTOR_ELT_ORDER_BIG, and the instructions should -;; be emitted separately. 
-(define_insn "altivec_vsum2sws" - [(set (match_operand:V4SI 0 "register_operand" "=v") - (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") - (match_operand:V4SI 2 "register_operand" "v")] - UNSPEC_VSUM2SWS)) - (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR)) - (clobber (match_scratch:V4SI 3 "=v"))] +(define_expand "altivec_vsum2sws" + [(use (match_operand:V4SI 0 "register_operand")) + (use (match_operand:V4SI 1 "register_operand")) + (use (match_operand:V4SI 2 "register_operand"))] "TARGET_ALTIVEC" { if (VECTOR_ELT_ORDER_BIG) - return "vsum2sws %0,%1,%2"; + emit_insn (gen_altivec_vsum2sws_direct (operands[0], operands[1], + operands[2])); else - return "vsldoi %3,%2,%2,12\n\tvsum2sws %3,%1,%3\n\tvsldoi %0,%3,%3,4"; -} - [(set_attr "type" "veccomplex") - (set (attr "length") - (if_then_else - (match_test "VECTOR_ELT_ORDER_BIG") - (const_string "4") - (const_string "12")))]) - -(define_insn "altivec_vsumsws" + { + rtx tmp1 = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_reg_rtx (V4SImode); + emit_insn (gen_altivec_vsldoi_v4si (tmp1, operands[2], + operands[2], GEN_INT (12))); + emit_insn (gen_altivec_vsum2sws_direct (tmp2, operands[1], tmp1)); + emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2, + GEN_INT (4))); + } + DONE; +}) + +; FIXME: This can probably be expressed without an UNSPEC. 
+(define_insn "altivec_vsum2sws_direct" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") - (match_operand:V4SI 2 "register_operand" "v")] - UNSPEC_VSUMSWS)) - (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR)) - (clobber (match_scratch:V4SI 3 "=v"))] + (match_operand:V4SI 2 "register_operand" "v")] + UNSPEC_VSUM2SWS)) + (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))] + "TARGET_ALTIVEC" + "vsum2sws %0,%1,%2" + [(set_attr "type" "veccomplex")]) + +(define_expand "altivec_vsumsws" + [(use (match_operand:V4SI 0 "register_operand")) + (use (match_operand:V4SI 1 "register_operand")) + (use (match_operand:V4SI 2 "register_operand"))] "TARGET_ALTIVEC" { if (VECTOR_ELT_ORDER_BIG) - return "vsumsws %0,%1,%2"; + emit_insn (gen_altivec_vsumsws_direct (operands[0], operands[1], + operands[2])); else - return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvsldoi %0,%3,%3,12"; -} - [(set_attr "type" "veccomplex") - (set (attr "length") - (if_then_else - (match_test "(VECTOR_ELT_ORDER_BIG)") - (const_string "4") - (const_string "12")))]) + { + rtx tmp1 = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_reg_rtx (V4SImode); + emit_insn (gen_altivec_vspltw_direct (tmp1, operands[2], const0_rtx)); + emit_insn (gen_altivec_vsumsws_direct (tmp2, operands[1], tmp1)); + emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2, + GEN_INT (12))); + } + DONE; +}) +; FIXME: This can probably be expressed without an UNSPEC. (define_insn "altivec_vsumsws_direct" [(set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index a34c6923a0e..8d945ded851 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2017-09-05 Bill Schmidt + + PR target/81833 + * gcc.target/powerpc/pr81833-1.c: New file. + * gcc.target/powerpc/pr81833-2.c: New file. + 2017-09-05 H.J. 
Lu PR target/59501 diff --git a/gcc/testsuite/gcc.target/powerpc/pr81833-1.c b/gcc/testsuite/gcc.target/powerpc/pr81833-1.c new file mode 100644 index 00000000000..ca5ce006d20 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr81833-1.c @@ -0,0 +1,59 @@ +/* PR81833: This used to fail due to improper implementation of vec_msum. */ +/* Test case relies on -mcpu=power7 or later. Currently we don't have + machinery to express that, so we have two separate tests for -mcpu=power7 + and -mcpu=power8 to catch 32-bit BE on P7 and 64-bit BE/LE on P8. */ + +/* { dg-do run } */ +/* { dg-require-effective-target p8vector_hw } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */ +/* { dg-options "-mcpu=power8 -O2" } */ + +#include <altivec.h> + +#define vec_u8 vector unsigned char +#define vec_s8 vector signed char +#define vec_u16 vector unsigned short +#define vec_s16 vector signed short +#define vec_u32 vector unsigned int +#define vec_s32 vector signed int +#define vec_f vector float + +#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0) + +#define zero_u8v (vec_u8) zerov +#define zero_s8v (vec_s8) zerov +#define zero_u16v (vec_u16) zerov +#define zero_s16v (vec_s16) zerov +#define zero_u32v (vec_u32) zerov +#define zero_s32v (vec_s32) zerov + +signed int __attribute__((noinline)) +scalarproduct_int16_vsx (const signed short *v1, const signed short *v2, + int order) +{ + int i; + LOAD_ZERO; + register vec_s16 vec1; + register vec_s32 res = vec_splat_s32 (0), t; + signed int ires; + + for (i = 0; i < order; i += 8) { + vec1 = vec_vsx_ld (0, v1); + t = vec_msum (vec1, vec_vsx_ld (0, v2), zero_s32v); + res = vec_sums (t, res); + v1 += 8; + v2 += 8; + } + res = vec_splat (res, 3); + vec_ste (res, 0, &ires); + + return ires; +} + +int main(void) +{ + const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 }; + if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8) + __builtin_abort (); + return 0; +} diff --git 
a/gcc/testsuite/gcc.target/powerpc/pr81833-2.c b/gcc/testsuite/gcc.target/powerpc/pr81833-2.c new file mode 100644 index 00000000000..e9286833469 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr81833-2.c @@ -0,0 +1,59 @@ +/* PR81833: This used to fail due to improper implementation of vec_msum. */ +/* Test case relies on -mcpu=power7 or later. Currently we don't have + machinery to express that, so we have two separate tests for -mcpu=power7 + and -mcpu=power8 to catch 32-bit BE on P7 and 64-bit BE/LE on P8. */ + +/* { dg-do run } */ +/* { dg-require-effective-target vsx_hw } */ +/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power7" } } */ +/* { dg-options "-mcpu=power7 -O2" } */ + +#include <altivec.h> + +#define vec_u8 vector unsigned char +#define vec_s8 vector signed char +#define vec_u16 vector unsigned short +#define vec_s16 vector signed short +#define vec_u32 vector unsigned int +#define vec_s32 vector signed int +#define vec_f vector float + +#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0) + +#define zero_u8v (vec_u8) zerov +#define zero_s8v (vec_s8) zerov +#define zero_u16v (vec_u16) zerov +#define zero_s16v (vec_s16) zerov +#define zero_u32v (vec_u32) zerov +#define zero_s32v (vec_s32) zerov + +signed int __attribute__((noinline)) +scalarproduct_int16_vsx (const signed short *v1, const signed short *v2, + int order) +{ + int i; + LOAD_ZERO; + register vec_s16 vec1; + register vec_s32 res = vec_splat_s32 (0), t; + signed int ires; + + for (i = 0; i < order; i += 8) { + vec1 = vec_vsx_ld (0, v1); + t = vec_msum (vec1, vec_vsx_ld (0, v2), zero_s32v); + res = vec_sums (t, res); + v1 += 8; + v2 += 8; + } + res = vec_splat (res, 3); + vec_ste (res, 0, &ires); + + return ires; +} + +int main(void) +{ + const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 }; + if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8) + __builtin_abort (); + return 0; +} -- 2.30.2