From 001e73373e6d2e7c756141e0d7ac8e24ae1574ad Mon Sep 17 00:00:00 2001 From: Sergey Shalnov Date: Thu, 8 Feb 2018 23:31:15 +0100 Subject: [PATCH] re PR target/83008 ([performance] Is it better to avoid extra instructions in data passing between loops?) PR target/83008 * config/i386/x86-tune-costs.h (skylake_cost): Fix cost of storing integer register in SImode. Fix cost of 256 and 512 bit aligned SSE register store. * config/i386/i386.c (ix86_multiplication_cost): Fix multiplication cost for TARGET_AVX512DQ. testsuite/ChangeLog: PR target/83008 * gcc.target/i386/pr83008.c: New test. From-SVN: r257505 --- gcc/ChangeLog | 12 ++++++++ gcc/config/i386/i386.c | 4 +++ gcc/config/i386/x86-tune-costs.h | 4 +-- gcc/testsuite/ChangeLog | 5 ++++ gcc/testsuite/gcc.target/i386/pr83008.c | 40 +++++++++++++++++++++++++ 5 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr83008.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 4c22855c6c3..dd78a342cdf 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,15 @@ +2018-02-08 Sergey Shalnov + + PR target/83008 + * config/i386/x86-tune-costs.h (skylake_cost): Fix cost of + storing integer register in SImode. Fix cost of 256 and 512 + bit aligned SSE register store. + +2018-02-08 Sergey Shalnov + + * config/i386/i386.c (ix86_multiplication_cost): Fix + multiplication cost for TARGET_AVX512DQ. + 2018-02-08 Marek Polacek PR tree-optimization/84238 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index fc3d6f0aebc..a8709972e9c 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -40402,6 +40402,10 @@ ix86_multiplication_cost (const struct processor_costs *cost, ? cost->mulsd : cost->mulss, true); else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) { + /* vpmullq is used in this case. No emulation is needed. */ + if (TARGET_AVX512DQ) + return ix86_vec_cost (mode, cost->mulss, true); + /* V*QImode is emulated with 7-13 insns. 
*/ if (mode == V16QImode || mode == V32QImode) { diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index e943d1386fa..8409a5f166c 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1557,7 +1557,7 @@ struct processor_costs skylake_cost = { {4, 4, 4}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). */ - {6, 6, 6}, /* cost of storing integer registers */ + {6, 6, 3}, /* cost of storing integer registers */ 2, /* cost of reg,reg fld/fst */ {6, 6, 8}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ @@ -1572,7 +1572,7 @@ struct processor_costs skylake_cost = { {6, 6, 6, 10, 20}, /* cost of loading SSE registers in 32,64,128,256 and 512-bit */ {6, 6, 6, 10, 20}, /* cost of unaligned loads. */ - {8, 8, 8, 8, 16}, /* cost of storing SSE registers + {8, 8, 8, 12, 24}, /* cost of storing SSE registers in 32,64,128,256 and 512-bit */ {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ 2, 2, /* SSE->integer and integer->SSE moves */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 522af52e008..d9ed50cd322 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2018-02-08 Sergey Shalnov + + PR target/83008 + * gcc.target/i386/pr83008.c: New test. 
+ 2018-02-08 Peter Bergner PR target/81143 diff --git a/gcc/testsuite/gcc.target/i386/pr83008.c b/gcc/testsuite/gcc.target/i386/pr83008.c new file mode 100644 index 00000000000..87a4beae197 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr83008.c @@ -0,0 +1,40 @@ +/* PR target/83008 */ +/* { dg-do compile } */ +/* { dg-options "-Ofast -funroll-loops -march=skylake-avx512 -mfpmath=sse" } */ +/* { dg-final { scan-assembler-not "vmovdq(a|u)(32|64)" } } */ + +int +pr83008 (unsigned char *pix1, int i_pix1, unsigned char *pix2, int i_pix2) +{ + unsigned int tmp[4][4]; + unsigned int a0, a1, a2, a3; + int sum = 0; + for (int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2) + { + a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); + a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16); + a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16); + a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16); + int t0 = a0 + a1; + int t1 = a0 - a1; + int t2 = a2 + a3; + int t3 = a2 - a3; + tmp[i][0] = t0 + t2; + tmp[i][2] = t0 - t2; + tmp[i][1] = t1 + t3; + tmp[i][3] = t1 - t3; + } + for (int i = 0; i < 4; i++) + { + int t0 = tmp[0][i] + tmp[1][i]; + int t1 = tmp[0][i] - tmp[1][i]; + int t2 = tmp[2][i] + tmp[3][i]; + int t3 = tmp[2][i] - tmp[3][i]; + a0 = t0 + t2; + a2 = t0 - t2; + a1 = t1 + t3; + a3 = t1 - t3; + sum += (a0) + (a1) + (a2) + (a3); + } + return (sum + ((unsigned int) sum >> 16)) >> 1; +} -- 2.30.2