Optimize memory broadcast for constant vector under AVX512.
authorliuhongt <hongtao.liu@intel.com>
Wed, 8 Jul 2020 09:14:36 +0000 (17:14 +0800)
committerliuhongt <hongtao.liu@intel.com>
Thu, 3 Sep 2020 08:10:45 +0000 (16:10 +0800)
For constant vector having one duplicated value, there's no need to put
whole vector in the constant pool, using embedded broadcast instead.

2020-07-09  Hongtao Liu  <hongtao.liu@intel.com>

gcc/ChangeLog:

PR target/87767
* config/i386/i386-features.c
(replace_constant_pool_with_broadcast): New function.
(constant_pool_broadcast): Ditto.
(class pass_constant_pool_broadcast): New pass.
(make_pass_constant_pool_broadcast): Ditto.
(remove_partial_avx_dependency): Call
replace_constant_pool_with_broadcast under TARGET_AVX512F, it
would save compile time when both pass rpad and cpb are
available.
(remove_partial_avx_dependency_gate): New function.
(class pass_remove_partial_avx_dependency::gate): Call
remove_partial_avx_dependency_gate.
* config/i386/i386-passes.def: Insert new pass after combine.
* config/i386/i386-protos.h
(make_pass_constant_pool_broadcast): Declare.
* config/i386/sse.md (*avx512dq_mul<mode>3<mask_name>_bcst):
New define_insn.
(*avx512f_mul<mode>3<mask_name>_bcst): Ditto.
* config/i386/avx512fintrin.h (_mm512_set1_ps,
_mm512_set1_pd,_mm512_set1_epi32, _mm512_set1_epi64): Adjusted.

gcc/testsuite/ChangeLog:

PR target/87767
* gcc.target/i386/avx2-broadcast-pr87767-1.c: New test.
* gcc.target/i386/avx512f-broadcast-pr87767-1.c: New test.
* gcc.target/i386/avx512f-broadcast-pr87767-2.c: New test.
* gcc.target/i386/avx512f-broadcast-pr87767-3.c: New test.
* gcc.target/i386/avx512f-broadcast-pr87767-4.c: New test.
* gcc.target/i386/avx512f-broadcast-pr87767-5.c: New test.
* gcc.target/i386/avx512f-broadcast-pr87767-6.c: New test.
* gcc.target/i386/avx512f-broadcast-pr87767-7.c: New test.
* gcc.target/i386/avx512vl-broadcast-pr87767-1.c: New test.
* gcc.target/i386/avx512vl-broadcast-pr87767-1.c: New test.
* gcc.target/i386/avx512vl-broadcast-pr87767-2.c: New test.
* gcc.target/i386/avx512vl-broadcast-pr87767-3.c: New test.
* gcc.target/i386/avx512vl-broadcast-pr87767-4.c: New test.
* gcc.target/i386/avx512vl-broadcast-pr87767-5.c: New test.
* gcc.target/i386/avx512vl-broadcast-pr87767-6.c: New test.

19 files changed:
gcc/config/i386/avx512fintrin.h
gcc/config/i386/i386-features.c
gcc/config/i386/i386-passes.def
gcc/config/i386/i386-protos.h
gcc/config/i386/sse.md
gcc/testsuite/gcc.target/i386/avx2-broadcast-pr87767-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-6.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-7.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-6.c [new file with mode: 0644]

index 0d53dda3a27b20b5117d06b6a1141265932cb0d7..729d5686d6870f0df98079e6df9188f669a1596f 100644 (file)
@@ -239,22 +239,17 @@ extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_set1_pd (double __A)
 {
-  return (__m512d) __builtin_ia32_broadcastsd512 (__extension__
-                                                 (__v2df) { __A, },
-                                                 (__v8df)
-                                                 _mm512_undefined_pd (),
-                                                 (__mmask8) -1);
+  return __extension__ (__m512d)(__v8df)
+    { __A, __A, __A, __A, __A, __A, __A, __A };
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_set1_ps (float __A)
 {
-  return (__m512) __builtin_ia32_broadcastss512 (__extension__
-                                                (__v4sf) { __A, },
-                                                (__v16sf)
-                                                _mm512_undefined_ps (),
-                                                (__mmask16) -1);
+  return __extension__ (__m512)(__v16sf)
+    { __A, __A, __A, __A, __A, __A, __A, __A,
+      __A, __A, __A, __A, __A, __A, __A, __A };
 }
 
 /* Create the vector [A B C D A B C D A B C D A B C D].  */
@@ -4072,10 +4067,9 @@ extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_set1_epi32 (int __A)
 {
-  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
-                                                          (__v16si)
-                                                          _mm512_undefined_epi32 (),
-                                                          (__mmask16)(-1));
+  return (__m512i)(__v16si)
+    { __A, __A, __A, __A, __A, __A, __A, __A,
+      __A, __A, __A, __A, __A, __A, __A, __A };
 }
 
 extern __inline __m512i
@@ -4128,10 +4122,7 @@ extern __inline __m512i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_set1_epi64 (long long __A)
 {
-  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
-                                                          (__v8di)
-                                                          _mm512_undefined_epi32 (),
-                                                          (__mmask8)(-1));
+  return (__m512i)(__v8di) { __A, __A, __A, __A, __A, __A, __A, __A };
 }
 
 extern __inline __m512i
index 535fc7e981da3c4414512643618679693a13e7ce..620f7f157f453a9f0d02e761e0e57d206d02b215 100644 (file)
@@ -2162,6 +2162,81 @@ make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
   return new pass_insert_endbr_and_patchable_area (ctxt);
 }
 
+/* Replace all one-value const vector that are referenced by SYMBOL_REFs in x
+   with embedded broadcast. i.e.transform
+
+     vpaddq .LC0(%rip), %zmm0, %zmm0
+     ret
+  .LC0:
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+    .quad 3
+
+    to
+
+     vpaddq .LC0(%rip){1to8}, %zmm0, %zmm0
+     ret
+  .LC0:
+    .quad 3  */
+static void
+replace_constant_pool_with_broadcast (rtx_insn *insn)
+{
+  subrtx_ptr_iterator::array_type array;
+  FOR_EACH_SUBRTX_PTR (iter, array, &PATTERN (insn), ALL)
+    {
+      rtx *loc = *iter;
+      rtx x = *loc;
+      rtx broadcast_mem, vec_dup, constant, first;
+      machine_mode mode;
+
+      /* Constant pool.  */
+      if (!MEM_P (x)
+         || !SYMBOL_REF_P (XEXP (x, 0))
+         || !CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)))
+       continue;
+
+      /* Const vector.  */
+      mode = GET_MODE (x);
+      if (!VECTOR_MODE_P (mode))
+       return;
+      constant = get_pool_constant (XEXP (x, 0));
+      if (GET_CODE (constant) != CONST_VECTOR)
+       return;
+
+      /* There could be some rtx like
+        (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+        but with "*.LC1" refer to V2DI constant vector.  */
+      if (GET_MODE (constant) != mode)
+       {
+         constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
+         if (constant == NULL_RTX || GET_CODE (constant) != CONST_VECTOR)
+           return;
+       }
+      first = XVECEXP (constant, 0, 0);
+
+      for (int i = 1; i < GET_MODE_NUNITS (mode); ++i)
+       {
+         rtx tmp = XVECEXP (constant, 0, i);
+         /* Vector duplicate value.  */
+         if (!rtx_equal_p (tmp, first))
+           return;
+       }
+
+      /* Replace with embedded broadcast.  */
+      broadcast_mem = force_const_mem (GET_MODE_INNER (mode), first);
+      vec_dup = gen_rtx_VEC_DUPLICATE (mode, broadcast_mem);
+      validate_change (insn, loc, vec_dup, 0);
+
+      /* At most 1 memory_operand in an insn.  */
+      return;
+    }
+}
+
 /* At entry of the nearest common dominator for basic blocks with
    conversions, generate a single
        vxorps %xmmN, %xmmN, %xmmN
@@ -2197,6 +2272,10 @@ remove_partial_avx_dependency (void)
          if (!NONDEBUG_INSN_P (insn))
            continue;
 
+         /* Handle AVX512 embedded broadcast here to save compile time.  */
+         if (TARGET_AVX512F)
+           replace_constant_pool_with_broadcast (insn);
+
          set = single_set (insn);
          if (!set)
            continue;
@@ -2333,6 +2412,16 @@ remove_partial_avx_dependency (void)
   return 0;
 }
 
+static bool
+remove_partial_avx_dependency_gate ()
+{
+  return (TARGET_AVX
+         && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+         && TARGET_SSE_MATH
+         && optimize
+         && optimize_function_for_speed_p (cfun));
+}
+
 namespace {
 
 const pass_data pass_data_remove_partial_avx_dependency =
@@ -2358,11 +2447,7 @@ public:
   /* opt_pass methods: */
   virtual bool gate (function *)
     {
-      return (TARGET_AVX
-             && TARGET_SSE_PARTIAL_REG_DEPENDENCY
-             && TARGET_SSE_MATH
-             && optimize
-             && optimize_function_for_speed_p (cfun));
+      return remove_partial_avx_dependency_gate ();
     }
 
   virtual unsigned int execute (function *)
@@ -2379,6 +2464,68 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
   return new pass_remove_partial_avx_dependency (ctxt);
 }
 
+/* For const vector having one duplicated value, there's no need to put
+   whole vector in the constant pool when target supports embedded broadcast. */
+static unsigned int
+constant_pool_broadcast (void)
+{
+  timevar_push (TV_MACH_DEP);
+  rtx_insn *insn;
+
+  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
+    {
+      if (INSN_P (insn))
+       replace_constant_pool_with_broadcast (insn);
+    }
+  timevar_pop (TV_MACH_DEP);
+  return 0;
+}
+
+namespace {
+
+const pass_data pass_data_constant_pool_broadcast =
+{
+  RTL_PASS, /* type */
+  "cpb", /* name */
+  OPTGROUP_NONE, /* optinfo_flags */
+  TV_MACH_DEP, /* tv_id */
+  0, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  TODO_df_finish, /* todo_flags_finish */
+};
+
+class pass_constant_pool_broadcast : public rtl_opt_pass
+{
+public:
+  pass_constant_pool_broadcast (gcc::context *ctxt)
+    : rtl_opt_pass (pass_data_constant_pool_broadcast, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+    {
+      /* Return false if rpad pass gate is true.
+        replace_constant_pool_with_broadcast is called
+        from both this pass and rpad pass.  */
+      return (TARGET_AVX512F && !remove_partial_avx_dependency_gate ());
+    }
+
+  virtual unsigned int execute (function *)
+    {
+      return constant_pool_broadcast ();
+    }
+}; // class pass_cpb
+
+} // anon namespace
+
+rtl_opt_pass *
+make_pass_constant_pool_broadcast (gcc::context *ctxt)
+{
+  return new pass_constant_pool_broadcast (ctxt);
+}
+
 /* This compares the priority of target features in function DECL1
    and DECL2.  It returns positive value if DECL1 is higher priority,
    negative value if DECL2 is higher priority and 0 if they are the
index d83c7b956b1d2e9b96e3405f0062168e3c1c68e5..07ecf8e790f3093978d73e90755bc9c52c9e0428 100644 (file)
@@ -33,3 +33,4 @@ along with GCC; see the file COPYING3.  If not see
   INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
 
   INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
+  INSERT_PASS_AFTER (pass_combine, 1, pass_constant_pool_broadcast);
index b6088f22d55908378bcacef77dea6ab0e284cc45..c5b700efd0e26beb0e19ee563ba92da8406f58d5 100644 (file)
@@ -386,3 +386,4 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
   (gcc::context *);
 extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
   (gcc::context *);
+extern rtl_opt_pass *make_pass_constant_pool_broadcast (gcc::context *);
index 8250325e1a39bf07802044bfb965d290785e9d11..a728b979f01037174cdf61c97838e87211738ed5 100644 (file)
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*avx512dq_mul<mode>3<mask_name>_bcst"
+  [(set (match_operand:VI8_AVX512VL 0 "register_operand" "=v")
+       (mult:VI8_AVX512VL
+         (vec_duplicate:VI8_AVX512VL
+           (match_operand:<ssescalarmode> 1 "memory_operand" "m"))
+         (match_operand:VI8_AVX512VL 2 "register_operand" "v")))]
+  "TARGET_AVX512DQ"
+  "vpmullq\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_expand "mul<mode>3<mask_name>"
   [(set (match_operand:VI4_AVX512F 0 "register_operand")
        (mult:VI4_AVX512F
    (set_attr "btver2_decode" "vector,vector,vector")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "*avx512f_mul<mode>3<mask_name>_bcst"
+  [(set (match_operand:VI4_AVX512VL 0 "register_operand" "=v")
+       (mult:VI4_AVX512VL
+         (vec_duplicate:VI4_AVX512VL
+           (match_operand:<ssescalarmode> 1 "memory_operand" "m"))
+         (match_operand:VI4_AVX512VL 2 "register_operand" "v")))]
+  "TARGET_AVX512F"
+   "vpmulld\t{%1<avx512bcst>, %2, %0<mask_operand3>|%0<mask_operand3>, %2, %1<avx512bcst>}"
+  [(set_attr "type" "sseimul")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_expand "mul<mode>3"
   [(set (match_operand:VI8_AVX2_AVX512F 0 "register_operand")
        (mult:VI8_AVX2_AVX512F
diff --git a/gcc/testsuite/gcc.target/i386/avx2-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx2-broadcast-pr87767-1.c
new file mode 100644 (file)
index 0000000..aee1680
--- /dev/null
@@ -0,0 +1,40 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "\\\{1to\[248\]\\\}" } }  */
+/* { dg-final { scan-assembler-not "\\\{1to16\\\}" } }  */
+
+typedef int v4si  __attribute__ ((vector_size (16)));
+typedef int v8si  __attribute__ ((vector_size (32)));
+typedef long long v2di  __attribute__ ((vector_size (16)));
+typedef long long v4di  __attribute__ ((vector_size (32)));
+typedef float v4sf  __attribute__ ((vector_size (16)));
+typedef float v8sf  __attribute__ ((vector_size (32)));
+typedef double v2df  __attribute__ ((vector_size (16)));
+typedef double v4df  __attribute__ ((vector_size (32)));
+
+#define FOO(VTYPE, OP_NAME, OP)                        \
+VTYPE                                          \
+ __attribute__ ((noipa))                       \
+foo_##OP_NAME##_##VTYPE (VTYPE a)              \
+{                                              \
+  return a OP 101;                             \
+}                                              \
+
+FOO (v4si, add, +);
+FOO (v8si, add, +);
+FOO (v2di, add, +);
+FOO (v4di, add, +);
+FOO (v4sf, add, +);
+FOO (v8sf, add, +);
+FOO (v2df, add, +);
+FOO (v4df, add, +);
+
+FOO (v4si, mul, *);
+FOO (v8si, mul, *);
+FOO (v2di, mul, *);
+FOO (v4di, mul, *);
+FOO (v4sf, mul, *);
+FOO (v8sf, mul, *);
+FOO (v2df, mul, *);
+FOO (v4df, mul, *);
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-1.c
new file mode 100644 (file)
index 0000000..a8ee5f5
--- /dev/null
@@ -0,0 +1,30 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512dq" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 5 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 5 } }  */
+
+typedef int v16si  __attribute__ ((vector_size (64)));
+typedef long long v8di  __attribute__ ((vector_size (64)));
+typedef float v16sf  __attribute__ ((vector_size (64)));
+typedef double v8df  __attribute__ ((vector_size (64)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP)                        \
+VTYPE                                          \
+ __attribute__ ((noipa))                       \
+foo_##OP_NAME##_##VTYPE (VTYPE a)              \
+{                                              \
+  return a OP CONSTANT;                                \
+}                                              \
+
+FOO (v16si, add, +);
+FOO (v8di, add, +);
+FOO (v16sf, add, +);
+FOO (v8df, add, +);
+FOO (v16si, sub, -);
+FOO (v8di, sub, -);
+FOO (v16si, mul, *);
+FOO (v8di, mul, *);
+FOO (v16sf, mul, *);
+FOO (v8df, mul, *);
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-2.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-2.c
new file mode 100644 (file)
index 0000000..30cf580
--- /dev/null
@@ -0,0 +1,42 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512dq" } */
+/* { dg-require-effective-target avx512dq } */
+
+#define AVX512DQ
+#include "avx512f-helper.h"
+
+#include "avx512f-broadcast-pr87767-1.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP)             \
+  do                                                   \
+    {                                                  \
+      TYPE exp[N], src[N];                             \
+      VTYPE res;                                       \
+      for (int i = 0; i < N; i++)                      \
+       src[i] = i * i * 107;                           \
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]);        \
+      for (int i = 0; i < N; i ++)                     \
+       exp[i] = src[i] OP CONSTANT;                    \
+      for (int j = 0; j < N; j++)                      \
+       {                                               \
+         if (res[j] != exp[j])                         \
+           abort();                                    \
+       }                                               \
+    }                                                  \
+  while (0)
+
+void
+test_512 (void)
+{
+  RTEST (v16si, int, 16, add, +);
+  RTEST (v8di, long long, 8, add, +);
+  RTEST (v16sf, float, 16, add, +);
+  RTEST (v8df, double, 8, add, +);
+  RTEST (v16si, int, 16, sub, -);
+  RTEST (v8di, long long, 8, sub, -);
+  RTEST (v16si, int, 16, mul, *);
+  RTEST (v8di, long long, 8, mul, *);
+  RTEST (v16sf, float, 16, mul, *);
+  RTEST (v8df, double, 8, mul, *);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-3.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-3.c
new file mode 100644 (file)
index 0000000..c2f22c4
--- /dev/null
@@ -0,0 +1,30 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 4 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to16\\\}" 4 } }  */
+
+typedef float v4sf  __attribute__ ((vector_size (16)));
+typedef float v8sf  __attribute__ ((vector_size (32)));
+typedef float v16sf  __attribute__ ((vector_size (64)));
+typedef double v2df  __attribute__ ((vector_size (16)));
+typedef double v4df  __attribute__ ((vector_size (32)));
+typedef double v8df  __attribute__ ((vector_size (64)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP1, OP2)          \
+VTYPE                                          \
+ __attribute__ ((noipa))                       \
+ foo_##OP_NAME##_##VTYPE (VTYPE a, VTYPE b)    \
+{                                              \
+  return (OP1 a * b) OP2 CONSTANT;             \
+}                                              \
+
+FOO (v16sf, fma,, +);
+FOO (v8df, fma,, +);
+FOO (v16sf, fms,, -);
+FOO (v8df, fms,, -);
+FOO (v16sf, fnma, -, +);
+FOO (v8df, fnma, -, +);
+FOO (v16sf, fnms, -, -);
+FOO (v8df, fnms, -, -);
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-4.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-4.c
new file mode 100644 (file)
index 0000000..dabe91b
--- /dev/null
@@ -0,0 +1,42 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#include "avx512f-helper.h"
+
+#include "avx512f-broadcast-pr87767-3.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2)                               \
+  do                                                                   \
+    {                                                                  \
+      TYPE exp[N], src1[N], src2[N];                                   \
+      VTYPE res;                                                       \
+      for (int i = 0; i < N; i++)                                      \
+       {                                                               \
+         src1[i] = i * i * 107.2f;                                     \
+         src2[i] = i * 2.f - 404.f;                                    \
+       }                                                               \
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src1[0], *(VTYPE*)&src2[0]); \
+      for (int i = 0; i < N; i ++)                                     \
+       exp[i] = (OP1 src1[i] * src2[i]) OP2 CONSTANT;                  \
+      for (int j = 0; j < N; j++)                                      \
+       {                                                               \
+         if (res[j] != exp[j])                                         \
+           abort();                                                    \
+       }                                                               \
+    }                                                                  \
+  while (0)
+
+void
+test_512 (void)
+{
+  RTEST (v16sf, float, 16, fma,, +);
+  RTEST (v8df, double, 8, fma,, +);
+  RTEST (v16sf, float, 16, fms,, -);
+  RTEST (v8df, double, 8, fms,, -);
+  RTEST (v16sf, float, 16, fnma,-, +);
+  RTEST (v8df, double, 8, fnma,-, +);
+  RTEST (v16sf, float, 16, fnms,-, -);
+  RTEST (v8df, double, 8, fnms,-, -);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-5.c
new file mode 100644 (file)
index 0000000..72e1098
--- /dev/null
@@ -0,0 +1,26 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-final { scan-assembler-times "\[^n\n\]*\\\{1to8\\\}" 4 } }  */
+/* { dg-final { scan-assembler-times "\[^n\n\]*\\\{1to16\\\}" 4 } }  */
+
+typedef int v16si  __attribute__ ((vector_size (64)));
+typedef long long v8di  __attribute__ ((vector_size (64)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP1, OP2)          \
+VTYPE                                          \
+ __attribute__ ((noipa))                       \
+ foo_##OP_NAME##_##VTYPE (VTYPE a)             \
+{                                              \
+  return (OP1 a) OP2 CONSTANT;                 \
+}                                              \
+
+FOO (v16si, andnot, ~, &);
+FOO (v8di, andnot, ~, &);
+FOO (v16si, and,, &);
+FOO (v8di, and,, &);
+FOO (v16si, or,, |);
+FOO (v8di, or,, |);
+FOO (v16si, xor,, ^);
+FOO (v8di, xor,, ^);
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-6.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-6.c
new file mode 100644 (file)
index 0000000..f288f83
--- /dev/null
@@ -0,0 +1,41 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f} */
+
+#include "avx512f-helper.h"
+
+#include "avx512f-broadcast-pr87767-5.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2)                               \
+  do                                                                   \
+    {                                                                  \
+      TYPE exp[N], src[N];                                             \
+      VTYPE res;                                                       \
+      for (int i = 0; i < N; i++)                                      \
+       {                                                               \
+         src[i] = i * i * 107;                                         \
+       }                                                               \
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]);                        \
+      for (int i = 0; i < N; i ++)                                     \
+       exp[i] = (OP1 src[i]) OP2 CONSTANT;                             \
+      for (int j = 0; j < N; j++)                                      \
+       {                                                               \
+         if (res[j] != exp[j])                                         \
+           abort();                                                    \
+       }                                                               \
+    }                                                                  \
+  while (0)
+
+void
+test_512 (void)
+{
+  RTEST (v16si, int, 16, andnot, ~, &);
+  RTEST (v8di, long long, 8, andnot, ~, &);
+  RTEST (v16si, int, 16, and,, &);
+  RTEST (v8di, long long, 8, and,, &);
+  RTEST (v16si, int, 16, or,, |);
+  RTEST (v8di, long long, 8, or,, |);
+  RTEST (v16si, int, 16, xor,, ^);
+  RTEST (v8di, long long, 8, xor,, ^);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-7.c b/gcc/testsuite/gcc.target/i386/avx512f-broadcast-pr87767-7.c
new file mode 100644 (file)
index 0000000..a8f145d
--- /dev/null
@@ -0,0 +1,121 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vadd\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vsub\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vmul\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vdiv\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfmadd\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfmsub\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfnmadd\[^\n\]*\\\{1to16\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to2\\\}" 1 } }  */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to4\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to8\\\}" 2 } }  */
+/* { dg-final { scan-assembler-times "vfnmsub\[^\n\]*\\\{1to16\\\}" 1 } }  */
+
+#include<immintrin.h>
+
+#define CONSTANT 101
+
+#define FOO(VTYPE, OP_NAME, LEN, SUFFIX, MTYPE)                        \
+  VTYPE                                                                        \
+  __attribute__ ((noipa))                                              \
+  _mm##LEN##_foo_##OP_NAME##_##SUFFIX (VTYPE dst, VTYPE src, MTYPE m)  \
+  {                                                                    \
+    return  _mm##LEN##_mask_##OP_NAME##_##SUFFIX (dst, m, src,         \
+                                                 _mm##LEN##_set1_##SUFFIX (CONSTANT)); \
+  }                                                                    \
+
+#define FOOZ(VTYPE, OP_NAME, LEN, SUFFIX, MTYPE)                       \
+  VTYPE                                                                        \
+  __attribute__ ((noipa))                                              \
+  _mm##LEN##_fooz_##OP_NAME##_##SUFFIX (VTYPE dst, VTYPE src, MTYPE m) \
+  {                                                                    \
+    return  _mm##LEN##_maskz_##OP_NAME##_##SUFFIX (m, dst, src,                \
+                                                 _mm##LEN##_set1_##SUFFIX (CONSTANT)); \
+  }                                                                    \
+
+FOO (__m512, add, 512, ps, __mmask16);
+FOO (__m256, add, 256, ps, __mmask8);
+FOO (__m128, add,, ps, __mmask8);
+
+FOO (__m512, sub, 512, ps, __mmask16);
+FOO (__m256, sub, 256, ps, __mmask8);
+FOO (__m128, sub,, ps, __mmask8);
+
+FOO (__m512, mul, 512, ps, __mmask16);
+FOO (__m256, mul, 256, ps, __mmask8);
+FOO (__m128, mul,, ps, __mmask8);
+
+FOO (__m512, div, 512, ps, __mmask16);
+FOO (__m256, div, 256, ps, __mmask8);
+FOO (__m128, div,, ps, __mmask8);
+
+FOOZ (__m512, fmadd, 512, ps, __mmask16);
+FOOZ (__m256, fmadd, 256, ps, __mmask8);
+FOOZ (__m128, fmadd,, ps, __mmask8);
+
+FOOZ (__m512, fmsub, 512, ps, __mmask16);
+FOOZ (__m256, fmsub, 256, ps, __mmask8);
+FOOZ (__m128, fmsub,, ps, __mmask8);
+
+FOOZ (__m512, fnmadd, 512, ps, __mmask16);
+FOOZ (__m256, fnmadd, 256, ps, __mmask8);
+FOOZ (__m128, fnmadd,, ps, __mmask8);
+
+FOOZ (__m512, fnmsub, 512, ps, __mmask16);
+FOOZ (__m256, fnmsub, 256, ps, __mmask8);
+FOOZ (__m128, fnmsub,, ps, __mmask8);
+
+FOO (__m512d, add, 512, pd, __mmask8);
+FOO (__m256d, add, 256, pd, __mmask8);
+FOO (__m128d, add,, pd, __mmask8);
+
+FOO (__m512d, sub, 512, pd, __mmask8);
+FOO (__m256d, sub, 256, pd, __mmask8);
+FOO (__m128d, sub,, pd, __mmask8);
+
+FOO (__m512d, mul, 512, pd, __mmask8);
+FOO (__m256d, mul, 256, pd, __mmask8);
+FOO (__m128d, mul,, pd, __mmask8);
+
+FOO (__m512d, div, 512, pd, __mmask8);
+FOO (__m256d, div, 256, pd, __mmask8);
+FOO (__m128d, div,, pd, __mmask8);
+
+FOOZ (__m512d, fmadd, 512, pd, __mmask8);
+FOOZ (__m256d, fmadd, 256, pd, __mmask8);
+FOOZ (__m128d, fmadd,, pd, __mmask8);
+
+FOOZ (__m512d, fmsub, 512, pd, __mmask8);
+FOOZ (__m256d, fmsub, 256, pd, __mmask8);
+FOOZ (__m128d, fmsub,, pd, __mmask8);
+
+FOOZ (__m512d, fnmadd, 512, pd, __mmask8);
+FOOZ (__m256d, fnmadd, 256, pd, __mmask8);
+FOOZ (__m128d, fnmadd,, pd, __mmask8);
+
+FOOZ (__m512d, fnmsub, 512, pd, __mmask8);
+FOOZ (__m256d, fnmsub, 256, pd, __mmask8);
+FOOZ (__m128d, fnmsub,, pd, __mmask8);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-1.c
new file mode 100644 (file)
index 0000000..397e287
--- /dev/null
@@ -0,0 +1,45 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl -mavx512dq" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 5 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 10 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 5 } }  */
+
+typedef int v4si  __attribute__ ((vector_size (16)));
+typedef int v8si  __attribute__ ((vector_size (32)));
+typedef long long v2di  __attribute__ ((vector_size (16)));
+typedef long long v4di  __attribute__ ((vector_size (32)));
+typedef float v4sf  __attribute__ ((vector_size (16)));
+typedef float v8sf  __attribute__ ((vector_size (32)));
+typedef double v2df  __attribute__ ((vector_size (16)));
+typedef double v4df  __attribute__ ((vector_size (32)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP)                        \
+VTYPE                                          \
+ __attribute__ ((noipa))                       \
+foo_##OP_NAME##_##VTYPE (VTYPE a)              \
+{                                              \
+  return a OP CONSTANT;                                \
+}                                              \
+
+FOO (v4si, add, +);
+FOO (v8si, add, +);
+FOO (v2di, add, +);
+FOO (v4di, add, +);
+FOO (v4si, sub, -);
+FOO (v8si, sub, -);
+FOO (v2di, sub, -);
+FOO (v4di, sub, -);
+FOO (v4sf, add, +);
+FOO (v8sf, add, +);
+FOO (v2df, add, +);
+FOO (v4df, add, +);
+FOO (v4si, mul, *);
+FOO (v8si, mul, *);
+FOO (v2di, mul, *);
+FOO (v4di, mul, *);
+FOO (v4sf, mul, *);
+FOO (v8sf, mul, *);
+FOO (v2df, mul, *);
+FOO (v4df, mul, *);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-2.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-2.c
new file mode 100644 (file)
index 0000000..9b796ac
--- /dev/null
@@ -0,0 +1,59 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512dq -mavx512vl" } */
+/* { dg-require-effective-target avx512dq } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512DQ
+#define AVX512VL
+#include "avx512f-helper.h"
+
+#include "avx512vl-broadcast-pr87767-1.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP)             \
+  do                                                   \
+    {                                                  \
+      TYPE exp[N], src[N];                             \
+      VTYPE res;                                       \
+      for (int i = 0; i < N; i++)                      \
+       src[i] = i * i * 107;                           \
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]);        \
+      for (int i = 0; i < N; i ++)                     \
+       exp[i] = src[i] OP CONSTANT;                    \
+      for (int j = 0; j < N; j++)                      \
+       {                                               \
+         if (res[j] != exp[j])                         \
+           abort();                                    \
+       }                                               \
+    }                                                  \
+  while (0)
+
+void
+test_256 (void)
+{
+  RTEST (v8si, int, 8, add, +);
+  RTEST (v4di, long long, 4, add, +);
+  RTEST (v8sf, float, 8, add, +);
+  RTEST (v4df, double, 4, add, +);
+  RTEST (v8si, int, 8, sub, -);
+  RTEST (v4di, long long, 4, sub, -);
+  RTEST (v8si, int, 8, mul, *);
+  RTEST (v4di, long long, 4, mul, *);
+  RTEST (v8sf, float, 8, mul, *);
+  RTEST (v4df, double, 4, mul, *);
+}
+
+void
+test_128 (void)
+{
+  RTEST (v4si, int, 4, add, +);
+  RTEST (v2di, long long, 2, add, +);
+  RTEST (v4sf, float, 4, add, +);
+  RTEST (v2df, double, 2, add, +);
+  RTEST (v4si, int, 4, sub, -);
+  RTEST (v2di, long long, 2, sub, -);
+  RTEST (v4si, int, 4, mul, *);
+  RTEST (v2di, long long, 2, mul, *);
+  RTEST (v4sf, float, 4, mul, *);
+  RTEST (v2df, double, 2, mul, *);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-3.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-3.c
new file mode 100644 (file)
index 0000000..aedfb16
--- /dev/null
@@ -0,0 +1,37 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 4 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 8 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 4 } }  */
+
+typedef float v4sf  __attribute__ ((vector_size (16)));
+typedef float v8sf  __attribute__ ((vector_size (32)));
+typedef double v2df  __attribute__ ((vector_size (16)));
+typedef double v4df  __attribute__ ((vector_size (32)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP1, OP2)          \
+VTYPE                                          \
+ __attribute__ ((noipa))                       \
+ foo_##OP_NAME##_##VTYPE (VTYPE a, VTYPE b)    \
+{                                              \
+  return (OP1 a * b) OP2 CONSTANT;             \
+}                                              \
+
+FOO (v4sf, fma,, +);
+FOO (v8sf, fma,, +);
+FOO (v2df, fma,, +);
+FOO (v4df, fma,, +);
+FOO (v4sf, fms,, -);
+FOO (v8sf, fms,, -);
+FOO (v2df, fms,, -);
+FOO (v4df, fms,, -);
+FOO (v4sf, fnma, -, +);
+FOO (v8sf, fnma, -, +);
+FOO (v2df, fnma, -, +);
+FOO (v4df, fnma, -, +);
+FOO (v4sf, fnms, -, -);
+FOO (v8sf, fnms, -, -);
+FOO (v2df, fnms, -, -);
+FOO (v4df, fnms, -, -);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-4.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-4.c
new file mode 100644 (file)
index 0000000..40b8eb9
--- /dev/null
@@ -0,0 +1,56 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+#include "avx512f-helper.h"
+
+#include "avx512vl-broadcast-pr87767-3.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2)                               \
+  do                                                                   \
+    {                                                                  \
+      TYPE exp[N], src1[N], src2[N];                                   \
+      VTYPE res;                                                       \
+      for (int i = 0; i < N; i++)                                      \
+       {                                                               \
+         src1[i] = i * i * 107.2f;                                     \
+         src2[i] = i * 2.f - 404.f;                                    \
+       }                                                               \
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src1[0], *(VTYPE*)&src2[0]); \
+      for (int i = 0; i < N; i ++)                                     \
+       exp[i] = (OP1 src1[i] * src2[i]) OP2 CONSTANT;                  \
+      for (int j = 0; j < N; j++)                                      \
+       {                                                               \
+         if (res[j] != exp[j])                                         \
+           abort();                                                    \
+       }                                                               \
+    }                                                                  \
+  while (0)
+
+void
+test_256 (void)
+{
+  RTEST (v8sf, float, 8, fma,, +);
+  RTEST (v4df, double, 4, fma,, +);
+  RTEST (v8sf, float, 8, fms,, -);
+  RTEST (v4df, double, 4, fms,, -);
+  RTEST (v8sf, float, 8, fnma,-, +);
+  RTEST (v4df, double, 4, fnma,-, +);
+  RTEST (v8sf, float, 8, fnms,-, -);
+  RTEST (v4df, double, 4, fnms,-, -);
+}
+
+void
+test_128 (void)
+{
+  RTEST (v4sf, float, 4, fma,, +);
+  RTEST (v2df, double, 2, fma,, +);
+  RTEST (v4sf, float, 4, fms,, -);
+  RTEST (v2df, double, 2, fms,, -);
+  RTEST (v4sf, float, 4, fnma,-, +);
+  RTEST (v2df, double, 2, fnma,-, +);
+  RTEST (v4sf, float, 4, fnms,-, -);
+  RTEST (v2df, double, 2, fnms,-, -);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-5.c
new file mode 100644 (file)
index 0000000..1e9460f
--- /dev/null
@@ -0,0 +1,37 @@
+/* PR target/87767 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to2\\\}" 4 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to4\\\}" 8 } }  */
+/* { dg-final { scan-assembler-times "\[^\n\]*\\\{1to8\\\}" 4 } }  */
+
+typedef int v4si  __attribute__ ((vector_size (16)));
+typedef int v8si  __attribute__ ((vector_size (32)));
+typedef long long v2di  __attribute__ ((vector_size (16)));
+typedef long long v4di  __attribute__ ((vector_size (32)));
+
+#define CONSTANT 101;
+#define FOO(VTYPE, OP_NAME, OP1, OP2)          \
+VTYPE                                          \
+ __attribute__ ((noipa))                       \
+ foo_##OP_NAME##_##VTYPE (VTYPE a)             \
+{                                              \
+  return (OP1 a) OP2 CONSTANT;                 \
+}                                              \
+
+FOO (v4si, andnot, ~, &);
+FOO (v8si, andnot, ~, &);
+FOO (v2di, andnot, ~, &);
+FOO (v4di, andnot, ~, &);
+FOO (v4si, and,, &);
+FOO (v8si, and,, &);
+FOO (v2di, and,, &);
+FOO (v4di, and,, &);
+FOO (v4si, or,, |);
+FOO (v8si, or,, |);
+FOO (v2di, or,, |);
+FOO (v4di, or,, |);
+FOO (v4si, xor,, ^);
+FOO (v8si, xor,, ^);
+FOO (v2di, xor,, ^);
+FOO (v4di, xor,, ^);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-6.c b/gcc/testsuite/gcc.target/i386/avx512vl-broadcast-pr87767-6.c
new file mode 100644 (file)
index 0000000..493a76f
--- /dev/null
@@ -0,0 +1,55 @@
+/* PR target/87767 */
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f -mavx512vl" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+#include "avx512f-helper.h"
+
+#include "avx512vl-broadcast-pr87767-5.c"
+
+#define RTEST(VTYPE, TYPE, N, OP_NAME, OP1, OP2)                               \
+  do                                                                   \
+    {                                                                  \
+      TYPE exp[N], src[N];                                             \
+      VTYPE res;                                                       \
+      for (int i = 0; i < N; i++)                                      \
+       {                                                               \
+         src[i] = i * i * 107;                                         \
+       }                                                               \
+      res = foo_##OP_NAME##_##VTYPE (*(VTYPE*)&src[0]);                        \
+      for (int i = 0; i < N; i ++)                                     \
+       exp[i] = (OP1 src[i]) OP2 CONSTANT;                             \
+      for (int j = 0; j < N; j++)                                      \
+       {                                                               \
+         if (res[j] != exp[j])                                         \
+           abort();                                                    \
+       }                                                               \
+    }                                                                  \
+  while (0)
+
+void
+test_256 (void)
+{
+  RTEST (v8si, int, 8, andnot, ~, &);
+  RTEST (v4di, long long, 4, andnot, ~, &);
+  RTEST (v8si, int, 8, and,, &);
+  RTEST (v4di, long long, 4, and,, &);
+  RTEST (v8si, int, 8, or,, |);
+  RTEST (v4di, long long, 4, or,, |);
+  RTEST (v8si, int, 8, xor,, ^);
+  RTEST (v4di, long long, 4, xor,, ^);
+}
+
+void
+test_128 (void)
+{
+  RTEST (v4si, int, 4, andnot, ~, &);
+  RTEST (v2di, long long, 2, andnot, ~, &);
+  RTEST (v4si, int, 4, and,, &);
+  RTEST (v2di, long long, 2, and,, &);
+  RTEST (v4si, int, 4, or,, |);
+  RTEST (v2di, long long, 2, or,, |);
+  RTEST (v4si, int, 4, xor,, ^);
+  RTEST (v2di, long long, 2, xor,, ^);
+}