op1 = XEXP (x, 1);
if (VECTOR_MODE_P (mode))
- mode = GET_MODE_INNER (mode);
+ {
+ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+ mode = GET_MODE_INNER (mode);
+ if (vec_flags & VEC_ADVSIMD)
+ {
+ /* The by-element versions of the instruction have the same costs as
+ the normal 3-vector version. So don't add the costs of the
+ duplicate into the costs of the multiply. We make an assumption
+ that the input to the VEC_DUPLICATE is already on the FP & SIMD
+ side. This means costing of a MUL by element pre RA is a bit
+ optimistic. */
+ if (GET_CODE (op0) == VEC_DUPLICATE)
+ op0 = XEXP (op0, 0);
+ else if (GET_CODE (op1) == VEC_DUPLICATE)
+ op1 = XEXP (op1, 0);
+ }
+ }
/* Integer multiply/fma. */
if (GET_MODE_CLASS (mode) == MODE_INT)
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_float } */
+/* { dg-options "-Ofast" } */
+
+#include <arm_neon.h>
+
+/* Integer case: multiply 16 lanes by a loop-invariant scalar.  With
+   -Ofast the vectorizer should emit MUL (by element), i.e.
+   mul v.4s, v.4s, v.s[0] — see the scan-assembler-times below.  */
+void s_mult_i (int32_t* restrict res, int32_t* restrict a, int32_t b)
+{
+  for (int x = 0; x < 16; x++)
+    res[x] = a[x] * b;
+}
+
+/* Floating-point case: same shape as s_mult_i; with -Ofast this should
+   vectorize to FMUL (by element), i.e. fmul v.4s, v.4s, v.s[0] — see
+   the scan-assembler-times below.  */
+void s_mult_f (float32_t* restrict res, float32_t* restrict a, float32_t b)
+{
+  for (int x = 0; x < 16; x++)
+    res[x] = a[x] * b;
+}
+
+/* { dg-final { scan-assembler-times {\s+mul\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.s\[0\]} 4 } } */
+/* { dg-final { scan-assembler-times {\s+fmul\tv[0-9]+\.4s, v[0-9]+\.4s, v[0-9]+\.s\[0\]} 4 } } */