From df81764ba1a276d9b48f408bd2dd1e71e09e7863 Mon Sep 17 00:00:00 2001 From: Tamar Christina Date: Wed, 10 Jun 2020 11:55:46 +0100 Subject: [PATCH] AArch64: Adjust costing of by element MUL to be the same as SAME3 MUL. The cost model currently treats a multiplication by element as being more expensive than a three-same (SAME3) vector multiplication. This means that if the value is on the SIMD side we add an unneeded DUP. If the value is on the general-register side we use the more expensive DUP instead of fmov. This patch corrects the costs such that the two multiplies are costed the same, which allows us to generate fmul v3.4s, v3.4s, v0.s[0] instead of dup v0.4s, v0.s[0] fmul v3.4s, v3.4s, v0.4s gcc/ChangeLog: * config/aarch64/aarch64.c (aarch64_rtx_mult_cost): Adjust costs for mul. gcc/testsuite/ChangeLog: * gcc.target/aarch64/asimd-mull-elem.c: New test. --- gcc/config/aarch64/aarch64.c | 18 +++++++++++++++- .../gcc.target/aarch64/asimd-mull-elem.c | 21 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/asimd-mull-elem.c diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 973c65aa4fb..f3551a73d87 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -11279,7 +11279,23 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) op1 = XEXP (x, 1); if (VECTOR_MODE_P (mode)) - mode = GET_MODE_INNER (mode); + { + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + mode = GET_MODE_INNER (mode); + if (vec_flags & VEC_ADVSIMD) + { + /* The by-element versions of the instruction have the same costs as + the normal 3-vector version. So don't add the costs of the + duplicate into the costs of the multiply. We make an assumption + that the input to the VEC_DUPLICATE is already on the FP & SIMD + side. This means costing of a MUL by element pre RA is a bit + optimistic. 
*/ + if (GET_CODE (op0) == VEC_DUPLICATE) + op0 = XEXP (op0, 0); + else if (GET_CODE (op1) == VEC_DUPLICATE) + op1 = XEXP (op1, 0); + } + } /* Integer multiply/fma. */ if (GET_MODE_CLASS (mode) == MODE_INT) diff --git a/gcc/testsuite/gcc.target/aarch64/asimd-mull-elem.c b/gcc/testsuite/gcc.target/aarch64/asimd-mull-elem.c new file mode 100644 index 00000000000..513721cee0c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/asimd-mull-elem.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target vect_float } */ +/* { dg-options "-Ofast" } */ + +#include <arm_neon.h> + +void s_mult_i (int32_t* restrict res, int32_t* restrict a, int32_t b) +{ + for (int x = 0; x < 16; x++) + res[x] = a[x] * b; +} + +void s_mult_f (float32_t* restrict res, float32_t* restrict a, float32_t b) +{ + for (int x = 0; x < 16; x++) + res[x] = a[x] * b; +} + +/* { dg-final { scan-assembler-times {\s+mul\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.s\[0\]} 4 } } */ +/* { dg-final { scan-assembler-times {\s+fmul\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.s\[0\]} 4 } } */ -- 2.30.2