From 0f41b5e02fa47db2080b77e4e1f7cd3305457c05 Mon Sep 17 00:00:00 2001 From: Dennis Zhang Date: Thu, 22 Oct 2020 01:09:33 +0100 Subject: [PATCH] arm: Auto-vectorization for MVE: vmul This patch enables MVE vmul instructions for auto-vectorization. It includes MVE in expander mul3 to enable vectorization for MVE. Related MVE vmul insns are modified to support the expander by using expression 'mult' instead of unspec. The mul3 for vectorization in vec-common.md uses mode iterator VDQWH instead of VALLW to cover all supported modes. The macros ARM_HAVE_NEON__ARITH are used to select supported modes for different targets. The redundant mul3 in neon.md is removed. gcc/ChangeLog: 2020-10-22 Dennis Zhang * config/arm/mve.md (mve_vmulq): New entry for vmul instruction using expression 'mult'. (mve_vmulq_f): Use mult instead of VMULQ_F. * config/arm/neon.md (mul3): Removed. * config/arm/vec-common.md (mul3): Use the new mode macros ARM_HAVE__ARITH. Use mode iterator VDQWH instead of VALLW. gcc/testsuite/ChangeLog: * gcc.target/arm/simd/mve-vmul_1.c: New test. --- gcc/ChangeLog | 9 +++ gcc/config/arm/mve.md | 16 ++++- gcc/config/arm/neon.md | 11 ---- gcc/config/arm/vec-common.md | 13 ++-- gcc/testsuite/ChangeLog | 4 ++ .../gcc.target/arm/simd/mve-vmul_1.c | 64 +++++++++++++++++++ 6 files changed, 95 insertions(+), 22 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index a44bc0995ff..d6a326cbbc0 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2020-10-22 Dennis Zhang + + * config/arm/mve.md (mve_vmulq): New entry for vmul instruction + using expression 'mult'. + (mve_vmulq_f): Use mult instead of VMULQ_F. + * config/arm/neon.md (mul3): Removed. + * config/arm/vec-common.md (mul3): Use the new mode macros + ARM_HAVE__ARITH. Use mode iterator VDQWH instead of VALLW. + 2020-10-20 Andrew MacLeod PR tree-optimization/97505 diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 5dad38899be..764e2016316 100644 --- a/gcc/config/arm/mve.md +++ b/gcc/config/arm/mve.md @@ -1551,6 +1551,17 @@ [(set_attr "type" "mve_move") ]) +(define_insn "mve_vmulq" + [ + (set (match_operand:MVE_2 0 "s_register_operand" "=w") + (mult:MVE_2 (match_operand:MVE_2 1 "s_register_operand" "w") + (match_operand:MVE_2 2 "s_register_operand" "w"))) + ] + "TARGET_HAVE_MVE" + "vmul.i%#\t%q0, %q1, %q2" + [(set_attr "type" "mve_move") +]) + ;; ;; [vornq_u, vornq_s]) ;; @@ -2562,9 +2573,8 @@ (define_insn "mve_vmulq_f" [ (set (match_operand:MVE_0 0 "s_register_operand" "=w") - (unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w") - (match_operand:MVE_0 2 "s_register_operand" "w")] - VMULQ_F)) + (mult:MVE_0 (match_operand:MVE_0 1 "s_register_operand" "w") + (match_operand:MVE_0 2 "s_register_operand" "w"))) ] "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT" "vmul.f%# %q0, %q1, %q2" diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 85e424e6cf4..e459b9ac8ef 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -1732,17 +1732,6 @@ (const_string "neon_mul_")))] ) -(define_insn "mul3" - [(set - (match_operand:VH 0 "s_register_operand" "=w") - (mult:VH - (match_operand:VH 1 "s_register_operand" "w") - (match_operand:VH 2 "s_register_operand" "w")))] - "ARM_HAVE_NEON__ARITH" - "vmul.f16\t%0, %1, %2" - [(set_attr "type" "neon_mul_")] -) - (define_insn "neon_vmulf" [(set (match_operand:VH 0 "s_register_operand" "=w") diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md index c3c86c46355..45db60e7411 100644 --- a/gcc/config/arm/vec-common.md +++ b/gcc/config/arm/vec-common.md @@ -101,14 +101,11 @@ }) (define_expand "mul3" - [(set (match_operand:VALLW 0 "s_register_operand") - (mult:VALLW (match_operand:VALLW 1 "s_register_operand") - (match_operand:VALLW 2 "s_register_operand")))] - "(TARGET_NEON && ((mode != V2SFmode && mode != V4SFmode) - || flag_unsafe_math_optimizations)) - || (mode == V4HImode && TARGET_REALLY_IWMMXT)" -{ -}) + [(set (match_operand:VDQWH 0 "s_register_operand") + (mult:VDQWH (match_operand:VDQWH 1 "s_register_operand") + (match_operand:VDQWH 2 "s_register_operand")))] + "ARM_HAVE__ARITH" +) (define_expand "smin3" [(set (match_operand:VALLW 0 "s_register_operand") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 649f2bd0c63..eb55a64b8f8 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2020-10-22 Dennis Zhang + + * gcc.target/arm/simd/mve-vmul_1.c: New test. + 2020-10-20 Jeff Law * gcc.dg/Wbuiltin-declaration-mismatch-9.c: Improve pruning of diff --git a/gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c b/gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c new file mode 100644 index 00000000000..514f292c15e --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/mve-vmul_1.c @@ -0,0 +1,64 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */ +/* { dg-add-options arm_v8_1m_mve_fp } */ +/* { dg-additional-options "-O3" } */ + +#include + +void test_vmul_i32 (int32_t * dest, int32_t * a, int32_t * b) { + int i; + for (i=0; i<4; i++) { + dest[i] = a[i] * b[i]; + } +} + +void test_vmul_i32_u (uint32_t * dest, uint32_t * a, uint32_t * b) { + int i; + for (i=0; i<4; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.i32\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ + +void test_vmul_i16 (int16_t * dest, int16_t * a, int16_t * b) { + int i; + for (i=0; i<8; i++) { + dest[i] = a[i] * b[i]; + } +} + +void test_vmul_i16_u (uint16_t * dest, uint16_t * a, uint16_t * b) { + int i; + for (i=0; i<8; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.i16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ + +void test_vmul_i8 (int8_t * dest, int8_t * a, int8_t * b) { + int i; + for (i=0; i<16; i++) { + dest[i] = a[i] * b[i]; + } +} + +void test_vmul_i8_u (uint8_t * dest, uint8_t * a, uint8_t * b) { + int i; + for (i=0; i<16; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.i8\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */ + +void test_vmul_f32 (float * dest, float * a, float * b) { + int i; + for (i=0; i<4; i++) { + dest[i] = a[i] * b[i]; + } +} + +/* { dg-final { scan-assembler-times {vmul\.f32\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */ + -- 2.30.2