From: liuhongt
Date: Wed, 3 Jun 2020 09:25:47 +0000 (+0800)
Subject: Optimize multiplication for V8QI,V16QI,V32QI under TARGET_AVX512BW.
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=54cdb2f5a5b01a482d7cbce30e7b738558eecf59;p=gcc.git

Optimize multiplication for V8QI,V16QI,V32QI under TARGET_AVX512BW.

2020-06-13 Hongtao Liu

gcc/ChangeLog:
        PR target/95488
        * config/i386/i386-expand.c (ix86_expand_vecmul_qihi): New
        function.
        * config/i386/i386-protos.h (ix86_expand_vecmul_qihi): Declare.
        * config/i386/sse.md (mul<mode>3): Drop mask_name since there's
        no real simd int8 multiplication instruction with mask.  Also
        optimize it under TARGET_AVX512BW.
        (mulv8qi3): New expander.

gcc/testsuite/ChangeLog:
        * gcc.target/i386/avx512bw-pr95488-1.c: New test.
        * gcc.target/i386/avx512bw-pr95488-2.c: Ditto.
        * gcc.target/i386/avx512vl-pr95488-1.c: Ditto.
        * gcc.target/i386/avx512vl-pr95488-2.c: Ditto.
---

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 270585decb2..3a414f69b3b 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -19466,6 +19466,78 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
 
   gcc_assert (ok);
 }
+
+/* Optimize vector MUL generation for V8QI, V16QI and V32QI
+   under TARGET_AVX512BW, e.g. for v16qi a * b emit
+
+   vpmovzxbw ymm2, xmm0
+   vpmovzxbw ymm3, xmm1
+   vpmullw   ymm4, ymm2, ymm3
+   vpmovwb   xmm0, ymm4
+
+   which takes fewer instructions than ix86_expand_vecop_qihi.
+   DEST receives the product of OP1 and OP2, and the mode of DEST
+   selects the expansion.  Return true on success; return false
+   without emitting anything when this expansion does not apply.  */
+
+bool
+ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
+{
+  machine_mode himode, qimode = GET_MODE (dest);
+  rtx hop1, hop2, hdest;
+  rtx (*gen_extend)(rtx, rtx);
+  rtx (*gen_truncate)(rtx, rtx);
+
+  /* There's no V64HImode multiplication instruction.  */
+  if (qimode == E_V64QImode)
+    return false;
+
+  /* vpmovwb is only available under AVX512BW.  */
+  if (!TARGET_AVX512BW)
+    return false;
+  if ((qimode == V8QImode || qimode == V16QImode)
+      && !TARGET_AVX512VL)
+    return false;
+  /* Do not generate zmm instructions when a 128/256-bit vector width
+     is preferred.  */
+  if (qimode == V32QImode
+      && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
+    return false;
+
+  switch (qimode)
+    {
+    case E_V8QImode:
+      himode = V8HImode;
+      gen_extend = gen_zero_extendv8qiv8hi2;
+      gen_truncate = gen_truncv8hiv8qi2;
+      break;
+    case E_V16QImode:
+      himode = V16HImode;
+      gen_extend = gen_zero_extendv16qiv16hi2;
+      gen_truncate = gen_truncv16hiv16qi2;
+      break;
+    case E_V32QImode:
+      himode = V32HImode;
+      gen_extend = gen_zero_extendv32qiv32hi2;
+      gen_truncate = gen_truncv32hiv32qi2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  hop1 = gen_reg_rtx (himode);
+  hop2 = gen_reg_rtx (himode);
+  hdest = gen_reg_rtx (himode);
+  /* The low 8 bits of each product do not depend on how the inputs
+     were extended, so zero extension is correct for both signed and
+     unsigned elements.  */
+  emit_insn (gen_extend (hop1, op1));
+  emit_insn (gen_extend (hop2, op2));
+  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
+                                                      hop1, hop2)));
+  emit_insn (gen_truncate (dest, hdest));
+  return true;
+}
 
 /* Expand a vector operation CODE for a V*QImode in terms of the
    same operation on V*HImode.  */
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index e5574496bb7..f5320494fa1 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -204,6 +204,7 @@ extern void ix86_expand_round (rtx, rtx);
 extern void ix86_expand_rounddf_32 (rtx, rtx);
 extern void ix86_expand_round_sse4 (rtx, rtx);
 
+extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx);
 extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
 
 extern rtx ix86_split_stack_guard (void);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7815d77bcbf..aa9fdc87c68 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -11658,12 +11658,27 @@
    (set_attr "prefix" "orig,maybe_evex")
    (set_attr "mode" "TI")])
 
-(define_expand "mul<mode>3<mask_name>"
+(define_expand "mulv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand")
+        (mult:V8QI (match_operand:V8QI 1 "register_operand")
+                   (match_operand:V8QI 2 "register_operand")))]
+  "TARGET_AVX512VL && TARGET_AVX512BW"
+{
+  /* Keep the call outside gcc_assert: the assertion's argument may not
+     be evaluated when assertion checking is compiled out.  */
+  bool ok = ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "mul<mode>3"
   [(set (match_operand:VI1_AVX512 0 "register_operand")
         (mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand")
                          (match_operand:VI1_AVX512 2 "register_operand")))]
-  "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
+  "TARGET_SSE2"
 {
+  if (ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]))
+    DONE;
   ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
   DONE;
 })
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c
new file mode 100644
index 00000000000..594e511868d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c
@@ -0,0 +1,21 @@
+/* PR target/95488 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 4 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*zmm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 2 } } */
+
+typedef char v32qi __attribute__ ((vector_size (32)));
+typedef unsigned char v32uqi __attribute__ ((vector_size (32)));
+
+__attribute__((noipa)) v32qi
+mul_512 (v32qi a, v32qi b)
+{
+  return a * b;
+}
+
+__attribute__((noipa)) v32uqi
+umul_512 (v32uqi a, v32uqi b)
+{
+  return a * b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c
new file mode 100644
index 00000000000..de319664618
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512bw-pr95488-1.c"
+
+#define TEST_MULB(typeV, typeS, N, fn) \
+do \
+  { \
+    typeV v1, v2, res; \
+    typeS s1[N], s2[N], exp[N]; \
+    int i,j; \
+    \
+    for (i = 0; i < N; i++) \
+      { \
+        s1[i] = i * i; \
+        s2[i] = i + 20; \
+      } \
+    for (i = 0; i < N; i++) \
+      exp[i] = s1[i] * s2[i]; \
+    v1 = *(typeV *)&s1[0]; \
+    v2 = *(typeV *)&s2[0]; \
+    res = fn (v1, v2); \
+    for (j = 0; j < N; j++) \
+      { \
+        if (res[j] != exp[j]) \
+          abort(); \
+      } \
+} \
+while (0)
+
+static void
+TEST (void)
+{
+  TEST_MULB (v32qi, char, 32, mul_512);
+  TEST_MULB (v32uqi, unsigned char, 32, umul_512);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
new file mode 100644
index 00000000000..b3674fbd04f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
@@ -0,0 +1,36 @@
+/* PR target/95488 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 8 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*ymm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 4 } } */
+
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef unsigned char v16uqi __attribute__ ((vector_size (16)));
+typedef unsigned char v8uqi __attribute__ ((vector_size (8)));
+
+__attribute__((noipa)) v8qi
+mul_128 (v8qi a, v8qi b)
+{
+  return a * b;
+}
+
+__attribute__((noipa)) v16qi
+mul_256 (v16qi a, v16qi b)
+{
+  return a * b;
+}
+
+__attribute__((noipa)) v8uqi
+umul_128 (v8uqi a, v8uqi b)
+{
+  return a * b;
+}
+
+__attribute__((noipa)) v16uqi
+umul_256 (v16uqi a, v16uqi b)
+{
+  return a * b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c
new file mode 100644
index 00000000000..45d7437bab6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c
@@ -0,0 +1,50 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512vl-pr95488-1.c"
+
+#define TEST_MULB(typeV, typeS, N, fn) \
+do \
+  { \
+    typeV v1, v2, res; \
+    int i,j; \
+    typeS s1[N], s2[N], exp[N]; \
+    \
+    for (i = 0; i < N; i++) \
+      { \
+        s1[i] = i * i; \
+        s2[i] = i + 20; \
+      } \
+    for (i = 0; i < N; i++) \
+      exp[i] = s1[i] * s2[i]; \
+    v1 = *(typeV *)s1; \
+    v2 = *(typeV *)s2; \
+    res = fn (v1, v2); \
+    for (j = 0; j < N; j++) \
+      { \
+        if (res[j] != exp[j]) \
+          abort(); \
+      } \
+  } \
+while (0)
+
+static void
+TEST (void)
+{
+  TEST_MULB(v8qi, char, 8, mul_128);
+  TEST_MULB(v8uqi, unsigned char, 8, umul_128);
+  TEST_MULB(v16qi, char, 16, mul_256);
+  TEST_MULB(v16uqi, unsigned char, 16, umul_256);
+}