From: Jakub Jelinek Date: Tue, 22 Mar 2016 08:28:49 +0000 (+0100) Subject: re PR target/70329 (wrong code with -mavx512bw and 64byte vectors) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b01915ed169065d75895d504a97337f071d9c4f7;p=gcc.git re PR target/70329 (wrong code with -mavx512bw and 64byte vectors) PR target/70329 * config/i386/i386.c (ix86_expand_vecop_qihi): Don't bother computing d.perm[i] for i >= d.nelt. If not full_interleave, compute d.perm[i] in a way that works also for AVX512BW. * gcc.target/i386/avx512bw-pr70329-1.c: New test. * gcc.target/i386/avx512bw-pr70329-2.c: New test. From-SVN: r234394 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index f5687a507e5..6dc6643b51e 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,10 @@ 2016-03-22 Jakub Jelinek + PR target/70329 + * config/i386/i386.c (ix86_expand_vecop_qihi): Don't bother computing + d.perm[i] for i >= d.nelt. If not full_interleave, compute d.perm[i] + in a way that works also for AVX512BW. + PR target/70300 * config/i386/i386.md (cvtsd2ss splitter): Unpack in destination instead of source if operands[1] is xmm16 and above and diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 3d8dbc48e5f..a4aad4feafa 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -51910,16 +51910,24 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) { /* For SSE2, we used an full interleave, so the desired results are in the even elements. */ - for (i = 0; i < 64; ++i) + for (i = 0; i < d.nelt; ++i) d.perm[i] = i * 2; } else { /* For AVX, the interleave used above was not cross-lane. So the extraction is evens but with the second and third quarter swapped. - Happily, that is even one insn shorter than even extraction. */ - for (i = 0; i < 64; ++i) - d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0); + Happily, that is even one insn shorter than even extraction. + For AVX512BW we have 4 lanes. 
We extract evens from within a lane, + always first from the first and then from the second source operand, + the index bits above the low 4 bits remain the same. + Thus, for d.nelt == 32 we want permutation + 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62 + and for d.nelt == 64 we want permutation + 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94, + 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */ + for (i = 0; i < d.nelt; ++i) + d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15); } ok = ix86_expand_vec_perm_const_1 (&d); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index aad5d3f241e..73b4eb286d1 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,9 @@ 2016-03-22 Jakub Jelinek + PR target/70329 + * gcc.target/i386/avx512bw-pr70329-1.c: New test. + * gcc.target/i386/avx512bw-pr70329-2.c: New test. + PR target/70300 * gcc.target/i386/pr70300.c: New test. diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr70329-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr70329-1.c new file mode 100644 index 00000000000..bb9a9551d0c --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr70329-1.c @@ -0,0 +1,27 @@ +/* PR target/70329 */ +/* { dg-do run } */ +/* { dg-options "-O0 -mavx512bw" } */ +/* { dg-require-effective-target avx512bw } */ + +#define AVX512BW +#include "avx512f-helper.h" + +typedef unsigned char A __attribute__ ((vector_size (64))); +typedef unsigned int B __attribute__ ((vector_size (64))); + +unsigned __attribute__ ((noinline, noclone)) +foo (A a, A b, B c) +{ + a *= b; + c[1] += a[8]; + return c[1]; +} + +void +TEST (void) +{ + A a = (A) { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + unsigned x = foo (a, a, (B) { 1, 2 }); + if (x != 83) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr70329-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr70329-2.c new file mode 100644 index 00000000000..731b9260794 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/avx512bw-pr70329-2.c @@ -0,0 +1,33 @@ +/* PR target/70329 */ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-vectorize -mavx512bw" } */ +/* { dg-require-effective-target avx512bw } */ + +#define AVX512BW +#include "avx512f-helper.h" + +__attribute__((noinline, noclone)) void +foo (unsigned char *src1, unsigned char *src2, unsigned char *dst) +{ + int i; + + for (i = 0; i < 64; i++) + dst[i] = (unsigned char) ((int) src1[i] * (int) src2[i]); +} + +void +TEST (void) +{ + unsigned char a[64], b[64], c[64]; + int i; + + for (i = 0; i < 64; i++) + { + a[i] = i; + b[i] = (i + 1); + } + foo (a, b, c); + for (i = 0; i < 64; i++) + if (c[i] != (unsigned char) (i * (i + 1))) + abort (); +}