__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set1_pd (double __A)
{
- return (__m512d) __builtin_ia32_broadcastsd512 (__extension__
- (__v2df) { __A, },
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1);
+ return __extension__ (__m512d)(__v8df)
+ { __A, __A, __A, __A, __A, __A, __A, __A };
}
extern __inline __m512
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set1_ps (float __A)
{
- return (__m512) __builtin_ia32_broadcastss512 (__extension__
- (__v4sf) { __A, },
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1);
+ return __extension__ (__m512)(__v16sf)
+ { __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A };
}
/* Create the vector [A B C D A B C D A B C D A B C D]. */
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set1_epi32 (int __A)
{
- return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
- (__v16si)
- _mm512_undefined_epi32 (),
- (__mmask16)(-1));
+ return (__m512i)(__v16si)
+ { __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A };
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_set1_epi64 (long long __A)
{
- return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
- (__v8di)
- _mm512_undefined_epi32 (),
- (__mmask8)(-1));
+ return (__m512i)(__v8di) { __A, __A, __A, __A, __A, __A, __A, __A };
}
extern __inline __m512i
return new pass_insert_endbr_and_patchable_area (ctxt);
}
+/* Replace all one-value const vector that are referenced by SYMBOL_REFs in x
+ with embedded broadcast. i.e.transform
+
+ vpaddq .LC0(%rip), %zmm0, %zmm0
+ ret
+ .LC0:
+ .quad 3
+ .quad 3
+ .quad 3
+ .quad 3
+ .quad 3
+ .quad 3
+ .quad 3
+ .quad 3
+
+ to
+
+ vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0
+ ret
+ .LC0:
+ .quad 3 */
+static void
+replace_constant_pool_with_broadcast (rtx_insn *insn)
+{
+ subrtx_ptr_iterator::array_type array;
+ FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL)
+ {
+ rtx *loc = *iter;
+ rtx x = *loc;
+ rtx broadcast_mem, vec_dup, constant, first;
+ machine_mode mode;
+
+ /* Constant pool. */
+ if (!MEM_P (x)
+ || !SYMBOL_REF_P (XEXP (x, 0))
+ || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)))
+ continue;
+
+ /* Const vector. */
+ mode = GET_MODE (x);
+ if (!VECTOR_MODE_P (mode))
+ return;
+ constant = get_pool_constant (XEXP (x, 0));
+ if (GET_CODE (constant) != CONST_VECTOR)
+ return;
+
+ /* There could be some rtx like
+ (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+ but with "*.LC1" refer to V2DI constant vector. */
+ if (GET_MODE (constant) != mode)
+ {
+ constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
+ if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR)
+ return;
+ }
+ first = XVECEXP (constant, 0, 0);
+
+ for (int i = 1; i < GET_MODE_NUNITS (mode); ++i)
+ {
+ rtx tmp = XVECEXP (constant, 0, i);
+ /* Vector duplicate value. */
+ if (!rtx_equal_p (tmp, first))
+ return;
+ }
+
+ /* Replace with embedded broadcast. */
+ broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first);
+ vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem);
+ validate_change (insn, loc, vec_dup, 0);
+
+ /* At most 1 memory_operand in an insn. */
+ return;
+ }
+}
+
/* At entry of the nearest common dominator for basic blocks with
conversions, generate a single
vxorps %xmmN, %xmmN, %xmmN
if (!NONDEBUG_INSN_P (insn))
continue;
+ /* Handle AVX512 embedded broadcast here to save compile time. */
+ if (TARGET_AVX512F)
+ replace_constant_pool_with_broadcast (insn);
+
set = single_set (insn);
if (!set)
continue;
return 0;
}
+static bool
+remove_partial_avx_dependency_gate ()
+{
+ return (TARGET_AVX
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ && TARGET_SSE_MATH
+ && optimize
+ && optimize_function_for_speed_p (cfun));
+}
+
namespace {
const pass_data pass_data_remove_partial_avx_dependency =
/* opt_pass methods: */
virtual bool gate (function *)
{
- return (TARGET_AVX
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
- && TARGET_SSE_MATH
- && optimize
- && optimize_function_for_speed_p (cfun));
+ return remove_partial_avx_dependency_gate ();
}
virtual unsigned int execute (function *)
return new pass_remove_partial_avx_dependency (ctxt);
}
+/* For const vector having one duplicated value, there's no need to put
+ whole vector in the constant pool when target supports embedded broadcast. */
+static unsigned int
+constant_pool_broadcast (void)
+{
+ timevar_push (TV_MACH_DEP);
+ rtx_insn *insn;
+
+ for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+ {
+ if (INSN_P (insn))
+ replace_constant_pool_with_broadcast (insn);
+ }
+ timevar_pop (TV_MACH_DEP);
+ return 0;
+}
+
+namespace {
+
+const pass_data pass_data_constant_pool_broadcast =
+{
+ RTL_PASS, /* type */
+ "cpb", /* name */
+ OPTGROUP_NONE, /* optinfo_flags */
+ TV_MACH_DEP, /* tv_id */
+ 0, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_constant_pool_broadcast : public rtl_opt_pass
+{
+public:
+ pass_constant_pool_broadcast (gcc::context *ctxt)
+ : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *)
+ {
+ /* Return false if rpad pass gate is true.
+ replace_constant_pool_with_broadcast is called
+ from both this pass and rpad pass. */
+ return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ());
+ }
+
+ virtual unsigned int execute (function *)
+ {
+ return constant_pool_broadcast ();
+ }
+}; // class pass_cpb
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_constant_pool_broadcast (gcc::context *ctxt)
+{
+ return new pass_constant_pool_broadcast (ctxt);
+}
+
/* This compares the priority of target features in function DECL1
and DECL2. It returns positive value if DECL1 is higher priority,
negative value if DECL2 is higher priority and 0 if they are the
INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
+ INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast);
(gcc::context *);
extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
(gcc::context *);
+extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *);
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn "*avx512dq_mul<mode>3<mask_name>_bcst"
+ [(set (match_operand:VI8_AVX512VL 0 "register_operand" "=v")
+ (mult:VI8_AVX512VL
+ (vec_duplicate:VI8_AVX512VL
+ (match_operand:<ssescalarmode> 1 "memory_operand" "m"))
+ (match_operand:VI8_AVX512VL 2 "register_operand" "v")))]
+ "TARGET_AVX512DQ"
+ "vpmullq\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}"
+ [(set_attr "type" "sseimul")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_expand "mul<mode>3<mask_name>"
[(set (match_operand:VI4_AVX512F 0 "register_operand")
(mult:VI4_AVX512F
(set_attr "btver2_decode" "vector,vector,vector")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn "*avx512f_mul<mode>3<mask_name>_bcst"
+ [(set (match_operand:VI4_AVX512VL 0 "register_operand" "=v")
+ (mult:VI4_AVX512VL
+ (vec_duplicate:VI4_AVX512VL
+ (match_operand:<ssescalarmode> 1 "memory_operand" "m"))
+ (match_operand:VI4_AVX512VL 2 "register_operand" "v")))]
+ "TARGET_AVX512F"
+ "vpmulld\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}"
+ [(set_attr "type" "sseimul")
+ (set_attr "prefix" "evex")
+ (set_attr "mode" "<sseinsnmode>")])
+
(define_expand "mul<mode>3"
[(set (match_operand:VI8_AVX2_AVX512F 0 "register_operand")
(mult:VI8_AVX2_AVX512F
--- /dev/null
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "\\\{1to\[248\]\\\}" } } */
+/* { dg-final { scan-assembler-not "\\\{1to16\\\}" } } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef int v8si __attribute__ ((vector_size (32)));
+typedef long long v2di __attribute__ ((vector_size (16)));
+typedef long long v4di __attribute__ ((vector_size (32)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef float v8sf __attribute__ ((vector_size (32)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef double v4df __attribute__ ((vector_size (32)));
+
+#define FOO(VTYPE, OP_NAME, OP) \
+VTYPE \
+ __attribute__ ((noipa)) \
+foo_##OP_NAME##_##VTYPE (VTYPE a) \
+{ \
+ return a OP 101; \
+} \
+
+FOO (v4si, add, +);
+FOO (v8si, add, +);
+FOO (v2di, add, +);
+FOO (v4di, add, +);
+FOO (v4sf, add, +);
+FOO (v8sf, add, +);
+FOO (v2df, add, +);
+FOO (v4df, add, +);
+
+FOO (v4si, mul, *);
+FOO (v8si, mul, *);
+FOO (v2di, mul, *);
+FOO (v4di, mul, *);
+FOO (v4sf, mul, *);
+FOO (v8sf, mul, *);
+FOO (v2df, mul, *);
+FOO (v4df, mul, *);
--- /dev/null
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512dq" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 5 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 5 } } */
+
+typedef int v16si __attribute__ ((vector_size (64)));
+typedef long long v8di __attribute__ ((vector_size (64)));
+typedef float v16sf __attribute__ ((vector_size (64)));
+typedef double v8df __attribute__ ((vector_size (64)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP) \
+VTYPE \
+ __attribute__ ((noipa)) \
+foo_##OP_NAME##_##VTYPE (VTYPE a) \
+{ \
+ return a OP CONSTANT; \
+} \
+
+FOO (v16si, add, +);
+FOO (v8di, add, +);
+FOO (v16sf, add, +);
+FOO (v8df, add, +);
+FOO (v16si, sub, -);
+FOO (v8di, sub, -);
+FOO (v16si, mul, *);
+FOO (v8di, mul, *);
+FOO (v16sf, mul, *);
+FOO (v8df, mul, *);
--- /dev/null
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512dq" } */
+/* { dg-require-effective-target avx512dq } */
+
+#define AVX512DQ
+#include "avx512f-helper.h"
+
+#include "avx512f-broadcast-pr87767-1.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP) \
+ do \
+ { \
+ TYPE exp[N], src[N]; \
+ VTYPE res; \
+ for (int i = 0; i < N; i++) \
+ src[i] = i * i * 107; \
+ res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]); \
+ for (int i = 0; i < N; i ++) \
+ exp[i] = src[i] OP CONSTANT; \
+ for (int j = 0; j < N; j++) \
+ { \
+ if (res[j] != exp[j]) \
+ abort(); \
+ } \
+ } \
+ while (0)
+
+void
+test_512 (void)
+{
+ RTEST (v16si, int, 16, add, +);
+ RTEST (v8di, long long, 8, add, +);
+ RTEST (v16sf, float, 16, add, +);
+ RTEST (v8df, double, 8, add, +);
+ RTEST (v16si, int, 16, sub, -);
+ RTEST (v8di, long long, 8, sub, -);
+ RTEST (v16si, int, 16, mul, *);
+ RTEST (v8di, long long, 8, mul, *);
+ RTEST (v16sf, float, 16, mul, *);
+ RTEST (v8df, double, 8, mul, *);
+}
--- /dev/null
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 4 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 4 } } */
+
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef float v8sf __attribute__ ((vector_size (32)));
+typedef float v16sf __attribute__ ((vector_size (64)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef double v4df __attribute__ ((vector_size (32)));
+typedef double v8df __attribute__ ((vector_size (64)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP1, OP2) \
+VTYPE \
+ __attribute__ ((noipa)) \
+ foo_##OP_NAME##_##VTYPE (VTYPE a, VTYPE b) \
+{ \
+ return (OP1 a * b) OP2 CONSTANT; \
+} \
+
+FOO (v16sf, fma,, +);
+FOO (v8df, fma,, +);
+FOO (v16sf, fms,, -);
+FOO (v8df, fms,, -);
+FOO (v16sf, fnma, -, +);
+FOO (v8df, fnma, -, +);
+FOO (v16sf, fnms, -, -);
+FOO (v8df, fnms, -, -);
--- /dev/null
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-helper.h"
+
+#include "avx512f-broadcast-pr87767-3.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2) \
+ do \
+ { \
+ TYPE exp[N], src1[N], src2[N]; \
+ VTYPE res; \
+ for (int i = 0; i < N; i++) \
+ { \
+ src1[i] = i * i * 107.2f; \
+ src2[i] = i * 2.f - 404.f; \
+ } \
+ res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src1[0], *(VTYPE*)&src2[0]); \
+ for (int i = 0; i < N; i ++) \
+ exp[i] = (OP1 src1[i] * src2[i]) OP2 CONSTANT; \
+ for (int j = 0; j < N; j++) \
+ { \
+ if (res[j] != exp[j]) \
+ abort(); \
+ } \
+ } \
+ while (0)
+
+void
+test_512 (void)
+{
+ RTEST (v16sf, float, 16, fma,, +);
+ RTEST (v8df, double, 8, fma,, +);
+ RTEST (v16sf, float, 16, fms,, -);
+ RTEST (v8df, double, 8, fms,, -);
+ RTEST (v16sf, float, 16, fnma,-, +);
+ RTEST (v8df, double, 8, fnma,-, +);
+ RTEST (v16sf, float, 16, fnms,-, -);
+ RTEST (v8df, double, 8, fnms,-, -);
+}
--- /dev/null
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-times "\[^n\n\]*\\\{1to8\\\}" 4 } } */
+/* { dg-final { scan-assembler-times "\[^n\n\]*\\\{1to16\\\}" 4 } } */
+
+typedef int v16si __attribute__ ((vector_size (64)));
+typedef long long v8di __attribute__ ((vector_size (64)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP1, OP2) \
+VTYPE \
+ __attribute__ ((noipa)) \
+ foo_##OP_NAME##_##VTYPE (VTYPE a) \
+{ \
+ return (OP1 a) OP2 CONSTANT; \
+} \
+
+FOO (v16si, andnot, ~, &);
+FOO (v8di, andnot, ~, &);
+FOO (v16si, and,, &);
+FOO (v8di, and,, &);
+FOO (v16si, or,, |);
+FOO (v8di, or,, |);
+FOO (v16si, xor,, ^);
+FOO (v8di, xor,, ^);
--- /dev/null
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f} */
+
+#include "avx512f-helper.h"
+
+#include "avx512f-broadcast-pr87767-5.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2) \
+ do \
+ { \
+ TYPE exp[N], src[N]; \
+ VTYPE res; \
+ for (int i = 0; i < N; i++) \
+ { \
+ src[i] = i * i * 107; \
+ } \
+ res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]); \
+ for (int i = 0; i < N; i ++) \
+ exp[i] = (OP1 src[i]) OP2 CONSTANT; \
+ for (int j = 0; j < N; j++) \
+ { \
+ if (res[j] != exp[j]) \
+ abort(); \
+ } \
+ } \
+ while (0)
+
+void
+test_512 (void)
+{
+ RTEST (v16si, int, 16, andnot, ~, &);
+ RTEST (v8di, long long, 8, andnot, ~, &);
+ RTEST (v16si, int, 16, and,, &);
+ RTEST (v8di, long long, 8, and,, &);
+ RTEST (v16si, int, 16, or,, |);
+ RTEST (v8di, long long, 8, or,, |);
+ RTEST (v16si, int, 16, xor,, ^);
+ RTEST (v8di, long long, 8, xor,, ^);
+}
--- /dev/null
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to16\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to16\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to16\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to16\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to16\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to16\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to16\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to2\\\}" 1 } } */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to4\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to8\\\}" 2 } } */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to16\\\}" 1 } } */
+
+#include<immintrin.h>
+
+#define CONSTANT 101
+
+#define FOO(VTYPE, OP_NAME, LEN, SUFFIX, MTYPE) \
+ VTYPE \
+ __attribute__ ((noipa)) \
+ _mm##LEN##_foo_##OP_NAME##_##SUFFIX (VTYPE dst, VTYPE src, MTYPE m) \
+ { \
+ return _mm##LEN##_mask_##OP_NAME##_##SUFFIX (dst, m, src, \
+ _mm##LEN##_set1_##SUFFIX (CONSTANT)); \
+ } \
+
+#define FOOZ(VTYPE, OP_NAME, LEN, SUFFIX, MTYPE) \
+ VTYPE \
+ __attribute__ ((noipa)) \
+ _mm##LEN##_fooz_##OP_NAME##_##SUFFIX (VTYPE dst, VTYPE src, MTYPE m) \
+ { \
+ return _mm##LEN##_maskz_##OP_NAME##_##SUFFIX (m, dst, src, \
+ _mm##LEN##_set1_##SUFFIX (CONSTANT)); \
+ } \
+
+FOO (__m512, add, 512, ps, __mmask16);
+FOO (__m256, add, 256, ps, __mmask8);
+FOO (__m128, add,, ps, __mmask8);
+
+FOO (__m512, sub, 512, ps, __mmask16);
+FOO (__m256, sub, 256, ps, __mmask8);
+FOO (__m128, sub,, ps, __mmask8);
+
+FOO (__m512, mul, 512, ps, __mmask16);
+FOO (__m256, mul, 256, ps, __mmask8);
+FOO (__m128, mul,, ps, __mmask8);
+
+FOO (__m512, div, 512, ps, __mmask16);
+FOO (__m256, div, 256, ps, __mmask8);
+FOO (__m128, div,, ps, __mmask8);
+
+FOOZ (__m512, fmadd, 512, ps, __mmask16);
+FOOZ (__m256, fmadd, 256, ps, __mmask8);
+FOOZ (__m128, fmadd,, ps, __mmask8);
+
+FOOZ (__m512, fmsub, 512, ps, __mmask16);
+FOOZ (__m256, fmsub, 256, ps, __mmask8);
+FOOZ (__m128, fmsub,, ps, __mmask8);
+
+FOOZ (__m512, fnmadd, 512, ps, __mmask16);
+FOOZ (__m256, fnmadd, 256, ps, __mmask8);
+FOOZ (__m128, fnmadd,, ps, __mmask8);
+
+FOOZ (__m512, fnmsub, 512, ps, __mmask16);
+FOOZ (__m256, fnmsub, 256, ps, __mmask8);
+FOOZ (__m128, fnmsub,, ps, __mmask8);
+
+FOO (__m512d, add, 512, pd, __mmask8);
+FOO (__m256d, add, 256, pd, __mmask8);
+FOO (__m128d, add,, pd, __mmask8);
+
+FOO (__m512d, sub, 512, pd, __mmask8);
+FOO (__m256d, sub, 256, pd, __mmask8);
+FOO (__m128d, sub,, pd, __mmask8);
+
+FOO (__m512d, mul, 512, pd, __mmask8);
+FOO (__m256d, mul, 256, pd, __mmask8);
+FOO (__m128d, mul,, pd, __mmask8);
+
+FOO (__m512d, div, 512, pd, __mmask8);
+FOO (__m256d, div, 256, pd, __mmask8);
+FOO (__m128d, div,, pd, __mmask8);
+
+FOOZ (__m512d, fmadd, 512, pd, __mmask8);
+FOOZ (__m256d, fmadd, 256, pd, __mmask8);
+FOOZ (__m128d, fmadd,, pd, __mmask8);
+
+FOOZ (__m512d, fmsub, 512, pd, __mmask8);
+FOOZ (__m256d, fmsub, 256, pd, __mmask8);
+FOOZ (__m128d, fmsub,, pd, __mmask8);
+
+FOOZ (__m512d, fnmadd, 512, pd, __mmask8);
+FOOZ (__m256d, fnmadd, 256, pd, __mmask8);
+FOOZ (__m128d, fnmadd,, pd, __mmask8);
+
+FOOZ (__m512d, fnmsub, 512, pd, __mmask8);
+FOOZ (__m256d, fnmsub, 256, pd, __mmask8);
+FOOZ (__m128d, fnmsub,, pd, __mmask8);
--- /dev/null
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 5 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 10 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 5 } } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef int v8si __attribute__ ((vector_size (32)));
+typedef long long v2di __attribute__ ((vector_size (16)));
+typedef long long v4di __attribute__ ((vector_size (32)));
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef float v8sf __attribute__ ((vector_size (32)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef double v4df __attribute__ ((vector_size (32)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP) \
+VTYPE \
+ __attribute__ ((noipa)) \
+foo_##OP_NAME##_##VTYPE (VTYPE a) \
+{ \
+ return a OP CONSTANT; \
+} \
+
+FOO (v4si, add, +);
+FOO (v8si, add, +);
+FOO (v2di, add, +);
+FOO (v4di, add, +);
+FOO (v4si, sub, -);
+FOO (v8si, sub, -);
+FOO (v2di, sub, -);
+FOO (v4di, sub, -);
+FOO (v4sf, add, +);
+FOO (v8sf, add, +);
+FOO (v2df, add, +);
+FOO (v4df, add, +);
+FOO (v4si, mul, *);
+FOO (v8si, mul, *);
+FOO (v2di, mul, *);
+FOO (v4di, mul, *);
+FOO (v4sf, mul, *);
+FOO (v8sf, mul, *);
+FOO (v2df, mul, *);
+FOO (v4df, mul, *);
--- /dev/null
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512dq -mavx512vl" } */
+/* { dg-require-effective-target avx512dq } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512DQ
+#define AVX512VL
+#include "avx512f-helper.h"
+
+#include "avx512vl-broadcast-pr87767-1.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP) \
+ do \
+ { \
+ TYPE exp[N], src[N]; \
+ VTYPE res; \
+ for (int i = 0; i < N; i++) \
+ src[i] = i * i * 107; \
+ res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]); \
+ for (int i = 0; i < N; i ++) \
+ exp[i] = src[i] OP CONSTANT; \
+ for (int j = 0; j < N; j++) \
+ { \
+ if (res[j] != exp[j]) \
+ abort(); \
+ } \
+ } \
+ while (0)
+
+void
+test_256 (void)
+{
+ RTEST (v8si, int, 8, add, +);
+ RTEST (v4di, long long, 4, add, +);
+ RTEST (v8sf, float, 8, add, +);
+ RTEST (v4df, double, 4, add, +);
+ RTEST (v8si, int, 8, sub, -);
+ RTEST (v4di, long long, 4, sub, -);
+ RTEST (v8si, int, 8, mul, *);
+ RTEST (v4di, long long, 4, mul, *);
+ RTEST (v8sf, float, 8, mul, *);
+ RTEST (v4df, double, 4, mul, *);
+}
+
+void
+test_128 (void)
+{
+ RTEST (v4si, int, 4, add, +);
+ RTEST (v2di, long long, 2, add, +);
+ RTEST (v4sf, float, 4, add, +);
+ RTEST (v2df, double, 2, add, +);
+ RTEST (v4si, int, 4, sub, -);
+ RTEST (v2di, long long, 2, sub, -);
+ RTEST (v4si, int, 4, mul, *);
+ RTEST (v2di, long long, 2, mul, *);
+ RTEST (v4sf, float, 4, mul, *);
+ RTEST (v2df, double, 2, mul, *);
+}
--- /dev/null
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 4 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 8 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 4 } } */
+
+typedef float v4sf __attribute__ ((vector_size (16)));
+typedef float v8sf __attribute__ ((vector_size (32)));
+typedef double v2df __attribute__ ((vector_size (16)));
+typedef double v4df __attribute__ ((vector_size (32)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP1, OP2) \
+VTYPE \
+ __attribute__ ((noipa)) \
+ foo_##OP_NAME##_##VTYPE (VTYPE a, VTYPE b) \
+{ \
+ return (OP1 a * b) OP2 CONSTANT; \
+} \
+
+FOO (v4sf, fma,, +);
+FOO (v8sf, fma,, +);
+FOO (v2df, fma,, +);
+FOO (v4df, fma,, +);
+FOO (v4sf, fms,, -);
+FOO (v8sf, fms,, -);
+FOO (v2df, fms,, -);
+FOO (v4df, fms,, -);
+FOO (v4sf, fnma, -, +);
+FOO (v8sf, fnma, -, +);
+FOO (v2df, fnma, -, +);
+FOO (v4df, fnma, -, +);
+FOO (v4sf, fnms, -, -);
+FOO (v8sf, fnms, -, -);
+FOO (v2df, fnms, -, -);
+FOO (v4df, fnms, -, -);
--- /dev/null
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+#include "avx512f-helper.h"
+
+#include "avx512vl-broadcast-pr87767-3.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2) \
+ do \
+ { \
+ TYPE exp[N], src1[N], src2[N]; \
+ VTYPE res; \
+ for (int i = 0; i < N; i++) \
+ { \
+ src1[i] = i * i * 107.2f; \
+ src2[i] = i * 2.f - 404.f; \
+ } \
+ res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src1[0], *(VTYPE*)&src2[0]); \
+ for (int i = 0; i < N; i ++) \
+ exp[i] = (OP1 src1[i] * src2[i]) OP2 CONSTANT; \
+ for (int j = 0; j < N; j++) \
+ { \
+ if (res[j] != exp[j]) \
+ abort(); \
+ } \
+ } \
+ while (0)
+
+void
+test_256 (void)
+{
+ RTEST (v8sf, float, 8, fma,, +);
+ RTEST (v4df, double, 4, fma,, +);
+ RTEST (v8sf, float, 8, fms,, -);
+ RTEST (v4df, double, 4, fms,, -);
+ RTEST (v8sf, float, 8, fnma,-, +);
+ RTEST (v4df, double, 4, fnma,-, +);
+ RTEST (v8sf, float, 8, fnms,-, -);
+ RTEST (v4df, double, 4, fnms,-, -);
+}
+
+void
+test_128 (void)
+{
+ RTEST (v4sf, float, 4, fma,, +);
+ RTEST (v2df, double, 2, fma,, +);
+ RTEST (v4sf, float, 4, fms,, -);
+ RTEST (v2df, double, 2, fms,, -);
+ RTEST (v4sf, float, 4, fnma,-, +);
+ RTEST (v2df, double, 2, fnma,-, +);
+ RTEST (v4sf, float, 4, fnms,-, -);
+ RTEST (v2df, double, 2, fnms,-, -);
+}
--- /dev/null
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 4 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 8 } } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 4 } } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+typedef int v8si __attribute__ ((vector_size (32)));
+typedef long long v2di __attribute__ ((vector_size (16)));
+typedef long long v4di __attribute__ ((vector_size (32)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP1, OP2) \
+VTYPE \
+ __attribute__ ((noipa)) \
+ foo_##OP_NAME##_##VTYPE (VTYPE a) \
+{ \
+ return (OP1 a) OP2 CONSTANT; \
+} \
+
+FOO (v4si, andnot, ~, &);
+FOO (v8si, andnot, ~, &);
+FOO (v2di, andnot, ~, &);
+FOO (v4di, andnot, ~, &);
+FOO (v4si, and,, &);
+FOO (v8si, and,, &);
+FOO (v2di, and,, &);
+FOO (v4di, and,, &);
+FOO (v4si, or,, |);
+FOO (v8si, or,, |);
+FOO (v2di, or,, |);
+FOO (v4di, or,, |);
+FOO (v4si, xor,, ^);
+FOO (v8si, xor,, ^);
+FOO (v2di, xor,, ^);
+FOO (v4di, xor,, ^);
--- /dev/null
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+#include "avx512f-helper.h"
+
+#include "avx512vl-broadcast-pr87767-5.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2) \
+ do \
+ { \
+ TYPE exp[N], src[N]; \
+ VTYPE res; \
+ for (int i = 0; i < N; i++) \
+ { \
+ src[i] = i * i * 107; \
+ } \
+ res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]); \
+ for (int i = 0; i < N; i ++) \
+ exp[i] = (OP1 src[i]) OP2 CONSTANT; \
+ for (int j = 0; j < N; j++) \
+ { \
+ if (res[j] != exp[j]) \
+ abort(); \
+ } \
+ } \
+ while (0)
+
+void
+test_256 (void)
+{
+ RTEST (v8si, int, 8, andnot, ~, &);
+ RTEST (v4di, long long, 4, andnot, ~, &);
+ RTEST (v8si, int, 8, and,, &);
+ RTEST (v4di, long long, 4, and,, &);
+ RTEST (v8si, int, 8, or,, |);
+ RTEST (v4di, long long, 4, or,, |);
+ RTEST (v8si, int, 8, xor,, ^);
+ RTEST (v4di, long long, 4, xor,, ^);
+}
+
+void
+test_128 (void)
+{
+ RTEST (v4si, int, 4, andnot, ~, &);
+ RTEST (v2di, long long, 2, andnot, ~, &);
+ RTEST (v4si, int, 4, and,, &);
+ RTEST (v2di, long long, 2, and,, &);
+ RTEST (v4si, int, 4, or,, |);
+ RTEST (v2di, long long, 2, or,, |);
+ RTEST (v4si, int, 4, xor,, ^);
+ RTEST (v2di, long long, 2, xor,, ^);
+}