Optimize multiplication for V8QI,V16QI,V32QI under TARGET_AVX512BW.
author liuhongt <hongtao.liu@intel.com>
Wed, 3 Jun 2020 09:25:47 +0000 (17:25 +0800)
committer liuhongt <hongtao.liu@intel.com>
Mon, 15 Jun 2020 01:43:15 +0000 (09:43 +0800)
2020-06-13   Hongtao Liu  <hongtao.liu@intel.com>

gcc/ChangeLog:
PR target/95488
* config/i386/i386-expand.c (ix86_expand_vecmul_qihi): New
function.
* config/i386/i386-protos.h (ix86_expand_vecmul_qihi): Declare.
* config/i386/sse.md (mul<mode>3): Drop mask_name since
there's no real simd int8 multiplication instruction with
mask. Also optimize it under TARGET_AVX512BW.
(mulv8qi3): New expander.

gcc/testsuite/ChangeLog:
* gcc.target/i386/avx512bw-pr95488-1.c: New test.
* gcc.target/i386/avx512bw-pr95488-2.c: Ditto.
* gcc.target/i386/avx512vl-pr95488-1.c: Ditto.
* gcc.target/i386/avx512vl-pr95488-2.c: Ditto.

gcc/config/i386/i386-expand.c
gcc/config/i386/i386-protos.h
gcc/config/i386/sse.md
gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c [new file with mode: 0644]

index 270585decb2cb951d6cc32048d2fb0e22f19b21b..3a414f69b3b5c845c0470c83579db5291fdd0f25 100644 (file)
@@ -19466,6 +19466,71 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
   gcc_assert (ok);
 }
 
+/* Expand the V8QI, V16QI or V32QI vector multiplication
+   DEST = OP1 * OP2 under TARGET_AVX512BW by widening to HImode,
+   e.g. for v16qi a * b it emits
+
+   vpmovzxbw ymm2, xmm0
+   vpmovzxbw ymm3, xmm1
+   vpmullw   ymm4, ymm2, ymm3
+   vpmovwb   xmm0, ymm4
+
+   which takes fewer instructions than ix86_expand_vecop_qihi.
+   Return true on success.  Return false, without emitting anything,
+   when this sequence is unavailable or undesirable so that the
+   caller can fall back to another expansion.  */
+
+bool
+ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
+{
+  machine_mode himode, qimode = GET_MODE (dest);
+  rtx hop1, hop2, hdest;
+  rtx (*gen_extend)(rtx, rtx);
+  rtx (*gen_truncate)(rtx, rtx);
+
+  /* There's no V64HImode multiplication instruction.  */
+  if (qimode == E_V64QImode)
+    return false;
+
+  /* vpmovwb is only available under AVX512BW.  */
+  if (!TARGET_AVX512BW)
+    return false;
+  /* The 128/256-bit forms of the HImode operations additionally
+     need AVX512VL.  */
+  if ((qimode == V8QImode || qimode == V16QImode)
+      && !TARGET_AVX512VL)
+    return false;
+  /* Do not generate ymm/zmm instructions when the target prefers
+     128/256 bit vector width: V16QI is widened to V16HI (ymm) and
+     V32QI is widened to V32HI (zmm).  */
+  if (qimode == V16QImode && TARGET_PREFER_AVX128)
+    return false;
+  if (qimode == V32QImode
+      && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
+    return false;
+
+  switch (qimode)
+    {
+    case E_V8QImode:
+      himode = V8HImode;
+      gen_extend = gen_zero_extendv8qiv8hi2;
+      gen_truncate = gen_truncv8hiv8qi2;
+      break;
+    case E_V16QImode:
+      himode = V16HImode;
+      gen_extend = gen_zero_extendv16qiv16hi2;
+      gen_truncate = gen_truncv16hiv16qi2;
+      break;
+    case E_V32QImode:
+      himode = V32HImode;
+      gen_extend = gen_zero_extendv32qiv32hi2;
+      gen_truncate = gen_truncv32hiv32qi2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  /* Zero extension serves signed elements as well: only the low
+     8 bits of each 16-bit product survive the truncation, and those
+     do not depend on how the upper bits were filled.  */
+  hop1 = gen_reg_rtx (himode);
+  hop2 = gen_reg_rtx (himode);
+  hdest = gen_reg_rtx (himode);
+  emit_insn (gen_extend (hop1, op1));
+  emit_insn (gen_extend (hop2, op2));
+  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
+                                                     hop1, hop2)));
+  emit_insn (gen_truncate (dest, hdest));
+  return true;
+}
 
 /* Expand a vector operation CODE for a V*QImode in terms of the
    same operation on V*HImode.  */
index e5574496bb7ed5591f4488178eab5ae7b73318cb..f5320494fa1e3414a4dc4f347fc50e6f19118d5d 100644 (file)
@@ -204,6 +204,7 @@ extern void ix86_expand_round (rtx, rtx);
 extern void ix86_expand_rounddf_32 (rtx, rtx);
 extern void ix86_expand_round_sse4 (rtx, rtx);
 
+extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx);
 extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
 
 extern rtx ix86_split_stack_guard (void);
index 7815d77bcbfdbe318fefd974e881c3b6db9a6171..aa9fdc87c682e6cabc800413843a2ce8f79cbde2 100644 (file)
    (set_attr "prefix" "orig,maybe_evex")
    (set_attr "mode" "TI")])
 
-(define_expand "mul<mode>3<mask_name>"
+;; V8QI multiplication.  It is only expanded through the AVX512BW and
+;; AVX512VL extend/multiply/truncate sequence, which cannot decline
+;; for V8QI under this expander's condition.
+(define_expand "mulv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand")
+       (mult:V8QI (match_operand:V8QI 1 "register_operand")
+                  (match_operand:V8QI 2 "register_operand")))]
+  "TARGET_AVX512VL && TARGET_AVX512BW"
+{
+  /* Keep the call outside gcc_assert: the asserted expression is not
+     guaranteed to be evaluated when assert checking is disabled, and
+     the call is what emits the instructions.  */
+  bool ok = ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "mul<mode>3"
   [(set (match_operand:VI1_AVX512 0 "register_operand")
        (mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand")
                       (match_operand:VI1_AVX512 2 "register_operand")))]
-  "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
+  "TARGET_SSE2"
 {
+  /* Try the AVX512BW extend/multiply/truncate sequence first; when it
+     declines (returns false), use the generic QImode-via-HImode
+     expansion below.  */
+  if (ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]))
+    DONE;
   ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
   DONE;
 })
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c
new file mode 100644 (file)
index 0000000..594e511
--- /dev/null
@@ -0,0 +1,21 @@
+/* PR target/95488  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" }  */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 4 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*zmm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 2 } } */
+
+typedef char v32qi __attribute__ ((vector_size (32)));
+typedef unsigned char v32uqi __attribute__ ((vector_size (32)));
+
+/* Element-wise product of two v32qi (32 x char) vectors.  The _512
+   suffix presumably refers to the 512-bit (zmm) HImode vector the
+   bytes are widened to -- see the vpmullw zmm scan above.  */
+__attribute__((noipa)) v32qi
+mul_512 (v32qi a, v32qi b)
+{
+  return  a * b;
+}
+
+/* Unsigned counterpart of mul_512: element-wise product of two
+   v32uqi (32 x unsigned char) vectors.  */
+__attribute__((noipa)) v32uqi
+umul_512 (v32uqi a, v32uqi b)
+{
+  return  a * b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c
new file mode 100644 (file)
index 0000000..de31966
--- /dev/null
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512bw-pr95488-1.c"
+
+/* Check FN, a multiply on N-element byte vectors of type typeV,
+   against a scalar reference computed on typeS elements; abort on
+   the first mismatching lane.  */
+#define TEST_MULB(typeV, typeS, N, fn)         \
+do                                             \
+  {                                            \
+    typeV v1, v2, res;                         \
+    typeS s1[N], s2[N], exp[N];                \
+    int i, j;                                  \
+                                               \
+    for (i = 0; i < N; i++)                    \
+      {                                        \
+       s1[i] = i * i;                          \
+       s2[i] = i + 20;                         \
+      }                                        \
+    for (i = 0; i < N; i++)                    \
+      exp[i] = s1[i] * s2[i];                  \
+    /* Copy instead of dereferencing a cast:   \
+       the scalar arrays are not guaranteed    \
+       to have the vector type's alignment.  */\
+    __builtin_memcpy (&v1, s1, sizeof (v1));   \
+    __builtin_memcpy (&v2, s2, sizeof (v2));   \
+    res = fn (v1, v2);                         \
+    for (j = 0; j < N; j++)                    \
+      {                                        \
+       if (res[j] != exp[j])                   \
+         abort();                              \
+      }                                        \
+  }                                            \
+while (0)
+
+/* Test body; TEST expands to avx512bw_test unless overridden, and is
+   presumably called by the harness pulled in through CHECK.  Runs the
+   signed and unsigned v32qi multiplies through TEST_MULB.  */
+static void
+TEST (void)
+{
+  TEST_MULB (v32qi, char, 32, mul_512);
+  TEST_MULB (v32uqi, unsigned char, 32, umul_512);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
new file mode 100644 (file)
index 0000000..b3674fb
--- /dev/null
@@ -0,0 +1,36 @@
+/* PR target/95488  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" }  */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 8 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*ymm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 4 } } */
+
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef unsigned char v16uqi __attribute__ ((vector_size (16)));
+typedef unsigned char v8uqi __attribute__ ((vector_size (8)));
+
+/* Element-wise product of two v8qi (8 x char) vectors.  The _128
+   suffix presumably refers to the 128-bit (xmm) HImode vector the
+   bytes are widened to -- see the vpmullw xmm scan above.  */
+__attribute__((noipa)) v8qi
+mul_128 (v8qi a, v8qi b)
+{
+  return  a * b;
+}
+
+/* Element-wise product of two v16qi (16 x char) vectors.  The _256
+   suffix presumably refers to the 256-bit (ymm) HImode vector the
+   bytes are widened to -- see the vpmullw ymm scan above.  */
+__attribute__((noipa)) v16qi
+mul_256 (v16qi a, v16qi b)
+{
+  return  a * b;
+}
+
+/* Unsigned counterpart of mul_128: element-wise product of two
+   v8uqi (8 x unsigned char) vectors.  */
+__attribute__((noipa)) v8uqi
+umul_128 (v8uqi a, v8uqi b)
+{
+  return  a * b;
+}
+
+/* Unsigned counterpart of mul_256: element-wise product of two
+   v16uqi (16 x unsigned char) vectors.  */
+__attribute__((noipa)) v16uqi
+umul_256 (v16uqi a, v16uqi b)
+{
+  return  a * b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c
new file mode 100644 (file)
index 0000000..45d7437
--- /dev/null
@@ -0,0 +1,50 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512vl-pr95488-1.c"
+
+/* Check FN, a multiply on N-element byte vectors of type typeV,
+   against a scalar reference computed on typeS elements; abort on
+   the first mismatching lane.  */
+#define TEST_MULB(typeV, typeS, N, fn)         \
+do                                             \
+  {                                            \
+    typeV v1, v2, res;                         \
+    int i, j;                                  \
+    typeS s1[N], s2[N], exp[N];                \
+                                               \
+    for (i = 0; i < N; i++)                    \
+      {                                        \
+       s1[i] = i * i;                          \
+       s2[i] = i + 20;                         \
+      }                                        \
+    for (i = 0; i < N; i++)                    \
+      exp[i] = s1[i] * s2[i];                  \
+    /* Copy instead of dereferencing a cast:   \
+       the scalar arrays are not guaranteed    \
+       to have the vector type's alignment.  */\
+    __builtin_memcpy (&v1, s1, sizeof (v1));   \
+    __builtin_memcpy (&v2, s2, sizeof (v2));   \
+    res = fn (v1, v2);                         \
+    for (j = 0; j < N; j++)                    \
+      {                                        \
+       if (res[j] != exp[j])                   \
+         abort();                              \
+      }                                        \
+  }                                            \
+while (0)
+
+/* Test body; TEST expands to avx512bw_test unless overridden, and is
+   presumably called by the harness pulled in through CHECK.  Runs the
+   signed and unsigned v8qi and v16qi multiplies through TEST_MULB.  */
+static void
+TEST (void)
+{
+  TEST_MULB(v8qi, char, 8, mul_128);
+  TEST_MULB(v8uqi, unsigned char, 8, umul_128);
+  TEST_MULB(v16qi, char, 16, mul_256);
+  TEST_MULB(v16uqi, unsigned char, 16, umul_256);
+}