From 2d2bc36c4440c126decee5a8379c158d9012adfc Mon Sep 17 00:00:00 2001 From: liuhongt Date: Thu, 6 Aug 2020 13:48:38 +0800 Subject: [PATCH] Enable direct movement between gpr and mask registers in pass_reload. Changelog gcc/ * config/i386/i386.c (inline_secondary_memory_needed): No memory is needed between mask regs and gpr. (ix86_hard_regno_mode_ok): Add condition TARGET_AVX512F for mask regno. * config/i386/i386.h (enum reg_class): Add INT_MASK_REGS. (REG_CLASS_NAMES): Ditto. (REG_CLASS_CONTENTS): Ditto. * config/i386/i386.md: Exclude mask register in define_peephole2 which is available only for gpr. gcc/testsuite/ * gcc.target/i386/spill_to_mask-1.c: New tests. * gcc.target/i386/spill_to_mask-2.c: New tests. * gcc.target/i386/spill_to_mask-3.c: New tests. * gcc.target/i386/spill_to_mask-4.c: New tests. --- gcc/config/i386/i386.c | 2 +- gcc/config/i386/i386.h | 3 + gcc/config/i386/i386.md | 4 +- .../gcc.target/i386/spill_to_mask-1.c | 92 +++++++++++++++++++ .../gcc.target/i386/spill_to_mask-2.c | 10 ++ .../gcc.target/i386/spill_to_mask-3.c | 10 ++ .../gcc.target/i386/spill_to_mask-4.c | 10 ++ 7 files changed, 128 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/spill_to_mask-1.c create mode 100644 gcc/testsuite/gcc.target/i386/spill_to_mask-2.c create mode 100644 gcc/testsuite/gcc.target/i386/spill_to_mask-3.c create mode 100644 gcc/testsuite/gcc.target/i386/spill_to_mask-4.c diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 548df57d5b3..2025e6b305a 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -18971,7 +18971,7 @@ ix86_hard_regno_mode_ok (unsigned int regno, machine_mode mode) if ((mode == P2QImode || mode == P2HImode)) return MASK_PAIR_REGNO_P(regno); - return (VALID_MASK_REG_MODE (mode) + return ((TARGET_AVX512F && VALID_MASK_REG_MODE (mode)) || (TARGET_AVX512BW && VALID_MASK_AVX512BW_MODE (mode))); } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index e0af87450b8..852dd017aa4 
100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1418,6 +1418,7 @@ enum reg_class FLOAT_INT_SSE_REGS, MASK_REGS, ALL_MASK_REGS, + INT_MASK_REGS, ALL_REGS, LIM_REG_CLASSES }; @@ -1477,6 +1478,7 @@ enum reg_class "FLOAT_INT_SSE_REGS", \ "MASK_REGS", \ "ALL_MASK_REGS", \ + "INT_MASK_REGS", \ "ALL_REGS" } /* Define which registers fit in which classes. This is an initializer @@ -1515,6 +1517,7 @@ enum reg_class { 0xff9ffff, 0xfffffff0, 0xf }, /* FLOAT_INT_SSE_REGS */ \ { 0x0, 0x0, 0xfe0 }, /* MASK_REGS */ \ { 0x0, 0x0, 0xff0 }, /* ALL_MASK_REGS */ \ + { 0x900ff, 0xff0, 0xff0 }, /* INT_MASK_REGS */ \ { 0xffffffff, 0xffffffff, 0xfff } /* ALL_REGS */ \ } diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index fb677e17817..896b99a4857 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -15026,7 +15026,7 @@ ;; Replace zero_extend:HI followed by parityhi2_cmp with parityqi2_cmp (define_peephole2 [(set (match_operand:HI 0 "register_operand") - (zero_extend:HI (match_operand:QI 1 "register_operand"))) + (zero_extend:HI (match_operand:QI 1 "general_reg_operand"))) (parallel [(set (reg:CC FLAGS_REG) (unspec:CC [(match_dup 0)] UNSPEC_PARITY)) (clobber (match_dup 0))])] @@ -15037,7 +15037,7 @@ ;; Eliminate QImode popcount&1 using parity flag (define_peephole2 [(set (match_operand:SI 0 "register_operand") - (zero_extend:SI (match_operand:QI 1 "register_operand"))) + (zero_extend:SI (match_operand:QI 1 "general_reg_operand"))) (parallel [(set (match_operand:SI 2 "register_operand") (popcount:SI (match_dup 0))) (clobber (reg:CC FLAGS_REG))]) diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c new file mode 100644 index 00000000000..c5043e224ea --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-1.c @@ -0,0 +1,92 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512" } */ + +#ifndef DTYPE +#define DTYPE u32 +#endif + +typedef unsigned long long 
u64; +typedef unsigned int u32; +typedef unsigned short u16; +typedef unsigned char u8; + +#define R(x,n) ( (x >> n) | (x << (32 - n))) + +#define S0(x) (R(x, 2) ^ R(x,13) ^ R(x,22)) +#define S1(x) (R(x, 6) ^ R(x,11) ^ R(x,25)) + +#define TT(a,b,c,d,e,f,g,h,x,K) \ +{ \ + tmp1 = h + S1(e) + (g ^ (e & (f ^ g))) + K + x; \ + tmp2 = S0(a) + ((a & b) | (c & (a | b))); \ + h = tmp1 + tmp2; \ + d += tmp1; \ +} + +static inline DTYPE byteswap(DTYPE x) +{ + x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16; + x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8; + return x; +} + +#define BE_LOAD32(n,b,i) (n) = byteswap(*(DTYPE *)(b + i)) + +void foo (u8 *in, DTYPE out[8], const DTYPE C[16]) +{ + DTYPE tmp1 = 0, tmp2 = 0, a, b, c, d, e, f, g, h; + DTYPE w0, w1, w2, w3, w4, w5, w6, w7, + w8, w9, w10, w11, w12, w13, w14, w15; + w0 = byteswap(*(DTYPE *)(in + 0)); + w1 = byteswap(*(DTYPE *)(in + 4)); + w2 = byteswap(*(DTYPE *)(in + 8)); + w3 = byteswap(*(DTYPE *)(in + 12)); + w4 = byteswap(*(DTYPE *)(in + 16)); + w5 = byteswap(*(DTYPE *)(in + 20)); + w6 = byteswap(*(DTYPE *)(in + 24)); + w7 = byteswap(*(DTYPE *)(in + 28)); + w8 = byteswap(*(DTYPE *)(in + 32)); + w9 = byteswap(*(DTYPE *)(in + 36)); + w10 = byteswap(*(DTYPE *)(in + 40)); + w11 = byteswap(*(DTYPE *)(in + 44)); + w12 = byteswap(*(DTYPE *)(in + 48)); + w13 = byteswap(*(DTYPE *)(in + 52)); + w14 = byteswap(*(DTYPE *)(in + 56)); + w15 = byteswap(*(DTYPE *)(in + 60)); + a = out[0]; + b = out[1]; + c = out[2]; + d = out[3]; + e = out[4]; + f = out[5]; + g = out[6]; + h = out[7]; + + TT(a, b, c, d, e, f, g, h, w0, C[0]); + TT(h, a, b, c, d, e, f, g, w1, C[1]); + TT(g, h, a, b, c, d, e, f, w2, C[2]); + TT(f, g, h, a, b, c, d, e, w3, C[3]); + TT(e, f, g, h, a, b, c, d, w4, C[4]); + TT(d, e, f, g, h, a, b, c, w5, C[5]); + TT(c, d, e, f, g, h, a, b, w6, C[6]); + TT(b, c, d, e, f, g, h, a, w7, C[7]); + TT(a, b, c, d, e, f, g, h, w8, C[8]); + TT(h, a, b, c, d, e, f, g, w9, C[9]); + TT(g, h, a, b, c, d, e, f, w10, C[10]); + TT(f, 
g, h, a, b, c, d, e, w11, C[11]); + TT(e, f, g, h, a, b, c, d, w12, C[12]); + TT(d, e, f, g, h, a, b, c, w13, C[13]); + TT(c, d, e, f, g, h, a, b, w14, C[14]); + TT(b, c, d, e, f, g, h, a, w15, C[15]); + + out[0] += a; + out[1] += b; + out[2] += c; + out[3] += d; + out[4] += e; + out[5] += f; + out[6] += g; + out[7] += h; +} + +/* { dg-final { scan-assembler "kmovd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c new file mode 100644 index 00000000000..1f0c6b474d6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512" } */ + +#ifndef DTYPE +#define DTYPE u16 +#endif + +#include "spill_to_mask-1.c" + +/* { dg-final { scan-assembler "kmovw" } } */ diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c new file mode 100644 index 00000000000..5b59090c296 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-3.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=skylake-avx512" } */ + +#ifndef DTYPE +#define DTYPE u8 +#endif + +#include "spill_to_mask-1.c" + +/* { dg-final { scan-assembler "kmovb" } } */ diff --git a/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c new file mode 100644 index 00000000000..f111cf42b36 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/spill_to_mask-4.c @@ -0,0 +1,10 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -march=skylake-avx512" } */ + +#ifndef DTYPE +#define DTYPE u64 +#endif + +#include "spill_to_mask-1.c" + +/* { dg-final { scan-assembler "kmovq" } } */ -- 2.30.2