gcc_assert (ok);
}
+/* Optimize vector MUL generation for V8QI, V16QI and V32QI
+   under TARGET_AVX512BW.  For example, for v16qi a * b we emit
+
+     vpmovzxbw ymm2, xmm0
+     vpmovzxbw ymm3, xmm1
+     vpmullw   ymm4, ymm2, ymm3
+     vpmovwb   xmm0, ymm4
+
+   which takes fewer instructions than ix86_expand_vecop_qihi.
+   Return true on success.  */
+
+bool
+ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
+{
+  machine_mode himode, qimode = GET_MODE (dest);
+  rtx hop1, hop2, hdest;
+  rtx (*gen_extend) (rtx, rtx);
+  rtx (*gen_truncate) (rtx, rtx);
+
+  /* There's no V64HImode multiplication instruction.  */
+  if (qimode == E_V64QImode)
+    return false;
+
+  /* vpmovwb is only available under AVX512BW.  */
+  if (!TARGET_AVX512BW)
+    return false;
+  /* The 128/256-bit forms additionally require AVX512VL.  */
+  if ((qimode == V8QImode || qimode == V16QImode)
+      && !TARGET_AVX512VL)
+    return false;
+  /* Do not generate zmm instructions when a 128/256-bit vector
+     width is preferred.  */
+  if (qimode == V32QImode
+      && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
+    return false;
+
+  switch (qimode)
+    {
+    case E_V8QImode:
+      himode = V8HImode;
+      gen_extend = gen_zero_extendv8qiv8hi2;
+      gen_truncate = gen_truncv8hiv8qi2;
+      break;
+    case E_V16QImode:
+      himode = V16HImode;
+      gen_extend = gen_zero_extendv16qiv16hi2;
+      gen_truncate = gen_truncv16hiv16qi2;
+      break;
+    case E_V32QImode:
+      himode = V32HImode;
+      gen_extend = gen_zero_extendv32qiv32hi2;
+      gen_truncate = gen_truncv32hiv32qi2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  hop1 = gen_reg_rtx (himode);
+  hop2 = gen_reg_rtx (himode);
+  hdest = gen_reg_rtx (himode);
+  emit_insn (gen_extend (hop1, op1));
+  emit_insn (gen_extend (hop2, op2));
+  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
+						      hop1, hop2)));
+  emit_insn (gen_truncate (dest, hdest));
+  return true;
+}
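
(Annotation, not part of the patch.)  Per element, the expansion above
amounts to the scalar model below; the function name is illustrative.
Zero extension is used even for signed elements because the low 8 bits
of the 16-bit product do not depend on how the inputs were extended:

    unsigned char
    qimul_via_himode (unsigned char a, unsigned char b)
    {
      unsigned short ha = a, hb = b;   /* vpmovzxbw */
      unsigned short hprod = ha * hb;  /* vpmullw */
      return (unsigned char) hprod;    /* vpmovwb */
    }
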
/* Expand a vector operation CODE for a V*QImode in terms of the
same operation on V*HImode. */
extern void ix86_expand_rounddf_32 (rtx, rtx);
extern void ix86_expand_round_sse4 (rtx, rtx);
+extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx);
extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
extern rtx ix86_split_stack_guard (void);
(set_attr "prefix" "orig,maybe_evex")
(set_attr "mode" "TI")])
-(define_expand "mul<mode>3<mask_name>"
+(define_expand "mulv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand")
+	(mult:V8QI (match_operand:V8QI 1 "register_operand")
+		   (match_operand:V8QI 2 "register_operand")))]
+  "TARGET_AVX512VL && TARGET_AVX512BW"
+{
+  bool ok = ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "mul<mode>3"
[(set (match_operand:VI1_AVX512 0 "register_operand")
(mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand")
(match_operand:VI1_AVX512 2 "register_operand")))]
- "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
+ "TARGET_SSE2"
{
+  if (ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]))
+    DONE;
ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
DONE;
})
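
(Annotation, not part of the patch.)  A minimal sketch of the resulting
dispatch, assuming default tuning; the function name is illustrative:

    typedef char v16qi __attribute__ ((vector_size (16)));

    v16qi
    mul_v16qi (v16qi a, v16qi b)
    {
      /* With -mavx512bw -mavx512vl this goes through
	 ix86_expand_vecmul_qihi (vpmovzxbw/vpmullw/vpmovwb); with
	 plain -msse2 that helper returns false and
	 ix86_expand_vecop_qihi provides the fallback.  */
      return a * b;
    }
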
--- /dev/null
+/* PR target/95488 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 4 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*zmm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 2 } } */
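+/* Two functions below, each expanding to two vpmovzxbw, one vpmullw
+   on zmm operands (v32qi widens to V32HI) and one vpmovwb, giving
+   the counts above.  */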
+
+typedef char v32qi __attribute__ ((vector_size (32)));
+typedef unsigned char v32uqi __attribute__ ((vector_size (32)));
+
+__attribute__((noipa)) v32qi
+mul_512 (v32qi a, v32qi b)
+{
+ return a * b;
+}
+
+__attribute__((noipa)) v32uqi
+umul_512 (v32uqi a, v32uqi b)
+{
+ return a * b;
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512bw-pr95488-1.c"
+
+#define TEST_MULB(typeV, typeS, N, fn) \
+do \
+ { \
+ typeV v1, v2, res; \
+ typeS s1[N], s2[N], exp[N]; \
+ int i, j; \
+ \
+ for (i = 0; i < N; i++) \
+ { \
+ s1[i] = i * i; \
+ s2[i] = i + 20; \
+ } \
+ for (i = 0; i < N; i++) \
+ exp[i] = s1[i] * s2[i]; \
+ v1 = *(typeV *)&s1[0]; \
+ v2 = *(typeV *)&s2[0]; \
+ res = fn (v1, v2); \
+ for (j = 0; j < N; j++) \
+ { \
+ if (res[j] != exp[j]) \
+ abort (); \
+ } \
+ } \
+while (0)
+
+static void
+TEST (void)
+{
+ TEST_MULB (v32qi, char, 32, mul_512);
+ TEST_MULB (v32uqi, unsigned char, 32, umul_512);
+}
--- /dev/null
+/* PR target/95488 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 8 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*ymm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 4 } } */
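+/* Four functions below: two vpmovzxbw each (8 total); vpmullw uses
+   ymm for the 16-byte vectors and xmm for the 8-byte ones (2 + 2);
+   one vpmovwb each (4 total).  */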
+
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef unsigned char v16uqi __attribute__ ((vector_size (16)));
+typedef unsigned char v8uqi __attribute__ ((vector_size (8)));
+
+__attribute__((noipa)) v8qi
+mul_128 (v8qi a, v8qi b)
+{
+ return a * b;
+}
+
+__attribute__((noipa)) v16qi
+mul_256 (v16qi a, v16qi b)
+{
+ return a * b;
+}
+
+__attribute__((noipa)) v8uqi
+umul_128 (v8uqi a, v8uqi b)
+{
+ return a * b;
+}
+
+__attribute__((noipa)) v16uqi
+umul_256 (v16uqi a, v16uqi b)
+{
+ return a * b;
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512vl-pr95488-1.c"
+
+#define TEST_MULB(typeV, typeS, N, fn) \
+do \
+ { \
+ typeV v1, v2, res; \
+ int i, j; \
+ typeS s1[N], s2[N], exp[N]; \
+ \
+ for (i = 0; i < N; i++) \
+ { \
+ s1[i] = i * i; \
+ s2[i] = i + 20; \
+ } \
+ for (i = 0; i < N; i++) \
+ exp[i] = s1[i] * s2[i]; \
+ v1 = *(typeV *)s1; \
+ v2 = *(typeV *)s2; \
+ res = fn (v1, v2); \
+ for (j = 0; j < N; j++) \
+ { \
+ if (res[j] != exp[j]) \
+ abort (); \
+ } \
+ } \
+while (0)
+
+static void
+TEST (void)
+{
+ TEST_MULB (v8qi, char, 8, mul_128);
+ TEST_MULB (v8uqi, unsigned char, 8, umul_128);
+ TEST_MULB (v16qi, char, 16, mul_256);
+ TEST_MULB (v16uqi, unsigned char, 16, umul_256);
+}