From 434641a57b90584bffa4e8def3f900c7d102bfdf Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Tue, 4 Sep 2012 08:32:39 +0000 Subject: [PATCH] arm.c (arm_evpc_neon_vext): New function. 2012-09-04 Christophe Lyon gcc/ * config/arm/arm.c (arm_evpc_neon_vext): New function. (arm_expand_vec_perm_const_1): Add call to arm_evpc_neon_vext. gcc/testsuite/ * gcc.target/arm/neon-vext.c: New test. * gcc.target/arm/neon-vext-execute.c: Ditto. From-SVN: r190911 --- gcc/ChangeLog | 7 + gcc/config/arm/arm.c | 72 ++++ gcc/testsuite/ChangeLog | 5 + .../gcc.target/arm/neon-vext-execute.c | 340 ++++++++++++++++++ gcc/testsuite/gcc.target/arm/neon-vext.c | 115 ++++++ 5 files changed, 539 insertions(+) create mode 100644 gcc/testsuite/gcc.target/arm/neon-vext-execute.c create mode 100644 gcc/testsuite/gcc.target/arm/neon-vext.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 059967a8c3b..cda27fd74df 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,10 @@ +2012-09-04 Christophe Lyon + + * config/arm/arm.c (arm_evpc_neon_vext): New + function. + (arm_expand_vec_perm_const_1): Add call to + arm_evpc_neon_vext. + 2012-09-04 Oleg Endo PR target/51244 diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c index 9ce3c0f068a..36937d2b90d 100644 --- a/gcc/config/arm/arm.c +++ b/gcc/config/arm/arm.c @@ -25937,6 +25937,72 @@ arm_evpc_neon_vtrn (struct expand_vec_perm_d *d) return true; } +/* Recognize patterns for the VEXT insns. */ + +static bool +arm_evpc_neon_vext (struct expand_vec_perm_d *d) +{ + unsigned int i, nelt = d->nelt; + rtx (*gen) (rtx, rtx, rtx, rtx); + rtx offset; + + unsigned int location; + + unsigned int next = d->perm[0] + 1; + + /* TODO: Handle GCC's numbering of elements for big-endian. */ + if (BYTES_BIG_ENDIAN) + return false; + + /* Check if the extracted indexes are increasing by one. */ + for (i = 1; i < nelt; next++, i++) + { + /* If we hit the most significant element of the 2nd vector in + the previous iteration, no need to test further. */ + if (next == 2 * nelt) + return false; + + /* If we are operating on only one vector: it could be a + rotation. If there are only two elements of size < 64, let + arm_evpc_neon_vrev catch it. */ + if (d->one_vector_p && (next == nelt)) + { + if ((nelt == 2) && (d->vmode != V2DImode)) + return false; + else + next = 0; + } + + if (d->perm[i] != next) + return false; + } + + location = d->perm[0]; + + switch (d->vmode) + { + case V16QImode: gen = gen_neon_vextv16qi; break; + case V8QImode: gen = gen_neon_vextv8qi; break; + case V4HImode: gen = gen_neon_vextv4hi; break; + case V8HImode: gen = gen_neon_vextv8hi; break; + case V2SImode: gen = gen_neon_vextv2si; break; + case V4SImode: gen = gen_neon_vextv4si; break; + case V2SFmode: gen = gen_neon_vextv2sf; break; + case V4SFmode: gen = gen_neon_vextv4sf; break; + case V2DImode: gen = gen_neon_vextv2di; break; + default: + return false; + } + + /* Success! */ + if (d->testing_p) + return true; + + offset = GEN_INT (location); + emit_insn (gen (d->target, d->op0, d->op1, offset)); + return true; +} + /* The NEON VTBL instruction is a fully variable permuation that's even stronger than what we expose via VEC_PERM_EXPR. What it doesn't do is mask the index operand as VEC_PERM_EXPR requires. Therefore we @@ -25976,6 +26042,12 @@ arm_evpc_neon_vtbl (struct expand_vec_perm_d *d) static bool arm_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) { + /* Check if the input mask matches vext before reordering the + operands. */ + if (TARGET_NEON) + if (arm_evpc_neon_vext (d)) + return true; + /* The pattern matching functions above are written to look for a small number to begin the sequence (0, 1, N/2). If we begin with an index from the second operand, we can swap the operands. */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 163eeab5897..4a0532188cd 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2012-09-04 Christophe Lyon + + * gcc.target/arm/neon-vext.c: New test. + * gcc.target/arm/neon-vext-execute.c: Ditto. + 2012-09-04 Janus Weil PR fortran/54243 diff --git a/gcc/testsuite/gcc.target/arm/neon-vext-execute.c b/gcc/testsuite/gcc.target/arm/neon-vext-execute.c new file mode 100644 index 00000000000..3d6c28cca89 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/neon-vext-execute.c @@ -0,0 +1,340 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-require-effective-target arm_little_endian } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_neon } */ + +#include +#include +#include + +uint8x8_t +tst_vext_u8 (uint8x8_t __a, uint8x8_t __b) +{ + uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9}; + + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint8x8_t +tst_vext_u8_rotate (uint8x8_t __a) +{ + uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint16x4_t +tst_vext_u16 (uint16x4_t __a, uint16x4_t __b) +{ + uint16x4_t __mask1 = {2, 3, 4, 5}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint16x4_t +tst_vext_u16_rotate (uint16x4_t __a) +{ + uint16x4_t __mask1 = {2, 3, 0, 1}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint32x2_t +tst_vext_u32 (uint32x2_t __a, uint32x2_t __b) +{ + uint32x2_t __mask1 = {1, 2}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +/* This one is mapped into vrev64.32. */ +uint32x2_t +tst_vext_u32_rotate (uint32x2_t __a) +{ + uint32x2_t __mask1 = {1, 0}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint8x16_t +tst_vextq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint8x16_t +tst_vextq_u8_rotate (uint8x16_t __a) +{ + uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 0, 1, 2, 3}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint16x8_t +tst_vextq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint16x8_t +tst_vextq_u16_rotate (uint16x8_t __a) +{ + uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint32x4_t +tst_vextq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + uint32x4_t __mask1 = {1, 2, 3, 4}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint32x4_t +tst_vextq_u32_rotate (uint32x4_t __a) +{ + uint32x4_t __mask1 = {1, 2, 3, 0}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint64x2_t +tst_vextq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + uint64x2_t __mask1 = {1, 2}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint64x2_t +tst_vextq_u64_rotate (uint64x2_t __a) +{ + uint64x2_t __mask1 = {1, 0}; + return __builtin_shuffle ( __a, __mask1) ; +} + +int main (void) +{ + uint8_t arr_u8x8[] = {0, 1, 2, 3, 4, 5, 6, 7}; + uint8_t arr2_u8x8[] = {8, 9, 10, 11, 12, 13, 14, 15}; + uint16_t arr_u16x4[] = {0, 1, 2, 3}; + uint16_t arr2_u16x4[] = {4, 5, 6, 7}; + uint32_t arr_u32x2[] = {0, 1}; + uint32_t arr2_u32x2[] = {2, 3}; + uint8_t arr_u8x16[] = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + uint8_t arr2_u8x16[] = {16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31}; + uint16_t arr_u16x8[] = {0, 1, 2, 3, 4, 5, 6, 7}; + uint16_t arr2_u16x8[] = {8, 9, 10, 11, 12, 13, 14, 15}; + uint32_t arr_u32x4[] = {0, 1, 2, 3}; + uint32_t arr2_u32x4[] = {4, 5, 6, 7}; + uint64_t arr_u64x2[] = {0, 1}; + uint64_t arr2_u64x2[] = {2, 3}; + + uint8_t expected_u8x8[] = {2, 3, 4, 5, 6, 7, 8, 9}; + uint8_t expected_rot_u8x8[] = {2, 3, 4, 5, 6, 7, 0, 1}; + uint16_t expected_u16x4[] = {2, 3, 4, 5}; + uint16_t expected_rot_u16x4[] = {2, 3, 0, 1}; + uint32_t expected_u32x2[] = {1, 2}; + uint32_t expected_rot_u32x2[] = {1, 0}; + uint8_t expected_u8x16[] = {4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19}; + uint8_t expected_rot_u8x16[] = {4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 0, 1, 2, 3,}; + uint16_t expected_u16x8[] = {2, 3, 4, 5, 6, 7, 8, 9}; + uint16_t expected_rot_u16x8[] = {2, 3, 4, 5, 6, 7, 0, 1}; + uint32_t expected_u32x4[] = {1, 2, 3, 4}; + uint32_t expected_rot_u32x4[] = {1, 2, 3, 0}; + uint64_t expected_u64x2[] = {1, 2}; + uint64_t expected_rot_u64x2[] = {1, 0}; + + uint8x8_t vec_u8x8 = vld1_u8 (arr_u8x8); + uint8x8_t vec2_u8x8 = vld1_u8 (arr2_u8x8); + uint16x4_t vec_u16x4 = vld1_u16 (arr_u16x4); + uint16x4_t vec2_u16x4 = vld1_u16 (arr2_u16x4); + uint32x2_t vec_u32x2 = vld1_u32 (arr_u32x2); + uint32x2_t vec2_u32x2 = vld1_u32 (arr2_u32x2); + uint8x16_t vec_u8x16 = vld1q_u8 (arr_u8x16); + uint8x16_t vec2_u8x16 = vld1q_u8 (arr2_u8x16); + uint16x8_t vec_u16x8 = vld1q_u16 (arr_u16x8); + uint16x8_t vec2_u16x8 = vld1q_u16 (arr2_u16x8); + uint32x4_t vec_u32x4 = vld1q_u32 (arr_u32x4); + uint32x4_t vec2_u32x4 = vld1q_u32 (arr2_u32x4); + uint64x2_t vec_u64x2 = vld1q_u64 (arr_u64x2); + uint64x2_t vec2_u64x2 = vld1q_u64 (arr2_u64x2); + + uint8x8_t result_u8x8; + uint16x4_t result_u16x4; + uint32x2_t result_u32x2; + uint8x16_t result_u8x16; + uint16x8_t result_u16x8; + uint32x4_t result_u32x4; + uint64x2_t result_u64x2; + + union {uint8x8_t v; uint8_t buf[8];} mem_u8x8; + union {uint16x4_t v; uint16_t buf[4];} mem_u16x4; + union {uint32x2_t v; uint32_t buf[2];} mem_u32x2; + union {uint8x16_t v; uint8_t buf[16];} mem_u8x16; + union {uint16x8_t v; uint16_t buf[8];} mem_u16x8; + union {uint32x4_t v; uint32_t buf[4];} mem_u32x4; + union {uint64x2_t v; uint64_t buf[2];} mem_u64x2; + + int i; + + result_u8x8 = tst_vext_u8 (vec_u8x8, vec2_u8x8); + vst1_u8 (mem_u8x8.buf, result_u8x8); + + for (i=0; i<8; i++) + if (mem_u8x8.buf[i] != expected_u8x8[i]) + { + printf ("tst_vext_u8[%d]=%d expected %d\n", + i, mem_u8x8.buf[i], expected_u8x8[i]); + abort (); + } + + result_u8x8 = tst_vext_u8_rotate (vec_u8x8); + vst1_u8 (mem_u8x8.buf, result_u8x8); + + for (i=0; i<8; i++) + if (mem_u8x8.buf[i] != expected_rot_u8x8[i]) + { + printf ("tst_vext_u8_rotate[%d]=%d expected %d\n", + i, mem_u8x8.buf[i], expected_rot_u8x8[i]); + abort (); + } + + + result_u16x4 = tst_vext_u16 (vec_u16x4, vec2_u16x4); + vst1_u16 (mem_u16x4.buf, result_u16x4); + + for (i=0; i<4; i++) + if (mem_u16x4.buf[i] != expected_u16x4[i]) + { + printf ("tst_vext_u16[%d]=%d expected %d\n", + i, mem_u16x4.buf[i], expected_u16x4[i]); + abort (); + } + + result_u16x4 = tst_vext_u16_rotate (vec_u16x4); + vst1_u16 (mem_u16x4.buf, result_u16x4); + + for (i=0; i<4; i++) + if (mem_u16x4.buf[i] != expected_rot_u16x4[i]) + { + printf ("tst_vext_u16_rotate[%d]=%d expected %d\n", + i, mem_u16x4.buf[i], expected_rot_u16x4[i]); + abort (); + } + + + result_u32x2 = tst_vext_u32 (vec_u32x2, vec2_u32x2); + vst1_u32 (mem_u32x2.buf, result_u32x2); + + for (i=0; i<2; i++) + if (mem_u32x2.buf[i] != expected_u32x2[i]) + { + printf ("tst_vext_u32[%d]=%d expected %d\n", + i, mem_u32x2.buf[i], expected_u32x2[i]); + abort (); + } + + result_u32x2 = tst_vext_u32_rotate (vec_u32x2); + vst1_u32 (mem_u32x2.buf, result_u32x2); + + for (i=0; i<2; i++) + if (mem_u32x2.buf[i] != expected_rot_u32x2[i]) + { + printf ("tst_vext_u32_rotate[%d]=%d expected %d\n", + i, mem_u32x2.buf[i], expected_rot_u32x2[i]); + abort (); + } + + + result_u8x16 = tst_vextq_u8 (vec_u8x16, vec2_u8x16); + vst1q_u8 (mem_u8x16.buf, result_u8x16); + + for (i=0; i<16; i++) + if (mem_u8x16.buf[i] != expected_u8x16[i]) + { + printf ("tst_vextq_u8[%d]=%d expected %d\n", + i, mem_u8x16.buf[i], expected_u8x16[i]); + abort (); + } + + result_u8x16 = tst_vextq_u8_rotate (vec_u8x16); + vst1q_u8 (mem_u8x16.buf, result_u8x16); + + for (i=0; i<16; i++) + if (mem_u8x16.buf[i] != expected_rot_u8x16[i]) + { + printf ("tst_vextq_u8_rotate[%d]=%d expected %d\n", + i, mem_u8x16.buf[i], expected_rot_u8x16[i]); + abort (); + } + + result_u16x8 = tst_vextq_u16 (vec_u16x8, vec2_u16x8); + vst1q_u16 (mem_u16x8.buf, result_u16x8); + + for (i=0; i<8; i++) + if (mem_u16x8.buf[i] != expected_u16x8[i]) + { + printf ("tst_vextq_u16[%d]=%d expected %d\n", + i, mem_u16x8.buf[i], expected_u16x8[i]); + abort (); + } + + result_u16x8 = tst_vextq_u16_rotate (vec_u16x8); + vst1q_u16 (mem_u16x8.buf, result_u16x8); + + for (i=0; i<8; i++) + if (mem_u16x8.buf[i] != expected_rot_u16x8[i]) + { + printf ("tst_vextq_u16_rotate[%d]=%d expected %d\n", + i, mem_u16x8.buf[i], expected_rot_u16x8[i]); + abort (); + } + + result_u32x4 = tst_vextq_u32 (vec_u32x4, vec2_u32x4); + vst1q_u32 (mem_u32x4.buf, result_u32x4); + + for (i=0; i<4; i++) + if (mem_u32x4.buf[i] != expected_u32x4[i]) + { + printf ("tst_vextq_u32[%d]=%d expected %d\n", + i, mem_u32x4.buf[i], expected_u32x4[i]); + abort (); + } + + result_u32x4 = tst_vextq_u32_rotate (vec_u32x4); + vst1q_u32 (mem_u32x4.buf, result_u32x4); + + for (i=0; i<4; i++) + if (mem_u32x4.buf[i] != expected_rot_u32x4[i]) + { + printf ("tst_vextq_u32_rotate[%d]=%d expected %d\n", + i, mem_u32x4.buf[i], expected_rot_u32x4[i]); + abort (); + } + + result_u64x2 = tst_vextq_u64 (vec_u64x2, vec2_u64x2); + vst1q_u64 (mem_u64x2.buf, result_u64x2); + + for (i=0; i<2; i++) + if (mem_u64x2.buf[i] != expected_u64x2[i]) + { + printf ("tst_vextq_u64[%d]=%lld expected %lld\n", + i, mem_u64x2.buf[i], expected_u64x2[i]); + abort (); + } + + result_u64x2 = tst_vextq_u64_rotate (vec_u64x2); + vst1q_u64 (mem_u64x2.buf, result_u64x2); + + for (i=0; i<2; i++) + if (mem_u64x2.buf[i] != expected_rot_u64x2[i]) + { + printf ("tst_vextq_u64_rotate[%d]=%lld expected %lld\n", + i, mem_u64x2.buf[i], expected_rot_u64x2[i]); + abort (); + } + + return 0; +} diff --git a/gcc/testsuite/gcc.target/arm/neon-vext.c b/gcc/testsuite/gcc.target/arm/neon-vext.c new file mode 100644 index 00000000000..4a012a996a8 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/neon-vext.c @@ -0,0 +1,115 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_neon_ok } */ +/* { dg-require-effective-target arm_little_endian } */ +/* { dg-options "-O2" } */ +/* { dg-add-options arm_neon } */ + +#include + +uint8x8_t +tst_vext_u8 (uint8x8_t __a, uint8x8_t __b) +{ + uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9}; + + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint8x8_t +tst_vext_u8_rotate (uint8x8_t __a) +{ + uint8x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint16x4_t +tst_vext_u16 (uint16x4_t __a, uint16x4_t __b) +{ + uint16x4_t __mask1 = {2, 3, 4, 5}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint16x4_t +tst_vext_u16_rotate (uint16x4_t __a) +{ + uint16x4_t __mask1 = {2, 3, 0, 1}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint32x2_t +tst_vext_u32 (uint32x2_t __a, uint32x2_t __b) +{ + uint32x2_t __mask1 = {1, 2}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +/* This one is mapped into vrev64.32. */ +uint32x2_t +tst_vext_u32_rotate (uint32x2_t __a) +{ + uint32x2_t __mask1 = {1, 0}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint8x16_t +tst_vextq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint8x16_t +tst_vextq_u8_rotate (uint8x16_t __a) +{ + uint8x16_t __mask1 = {4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 0, 1, 2, 3}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint16x8_t +tst_vextq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 8, 9}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint16x8_t +tst_vextq_u16_rotate (uint16x8_t __a) +{ + uint16x8_t __mask1 = {2, 3, 4, 5, 6, 7, 0, 1}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint32x4_t +tst_vextq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + uint32x4_t __mask1 = {1, 2, 3, 4}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint32x4_t +tst_vextq_u32_rotate (uint32x4_t __a) +{ + uint32x4_t __mask1 = {1, 2, 3, 0}; + return __builtin_shuffle ( __a, __mask1) ; +} + +uint64x2_t +tst_vextq_u64 (uint64x2_t __a, uint64x2_t __b) +{ + uint64x2_t __mask1 = {1, 2}; + return __builtin_shuffle ( __a, __b, __mask1) ; +} + +uint64x2_t +tst_vextq_u64_rotate (uint64x2_t __a) +{ + uint64x2_t __mask1 = {1, 0}; + return __builtin_shuffle ( __a, __mask1) ; +} + +/* { dg-final {scan-assembler-times "vext\.8\\t" 4} } */ +/* { dg-final {scan-assembler-times "vext\.16\\t" 4} } */ +/* { dg-final {scan-assembler-times "vext\.32\\t" 3} } */ +/* { dg-final {scan-assembler-times "vrev64\.32\\t" 1} } */ +/* { dg-final {scan-assembler-times "vext\.64\\t" 2} } */ -- 2.30.2