From a2ef9558d17aeb038cbc8a66a203f7a8e6c6e81e Mon Sep 17 00:00:00 2001 From: Markus Trippelsdorf <markus@trippelsdorf.de> Date: Sun, 17 Dec 2017 12:01:25 +0000 Subject: [PATCH] Correct imul (r64) latency for modern Intel CPUs Since Sandybridge the 64bit multiplication latency is three cycles, not four. So update the costs to reflect reality. * x86-tune-costs.h (skylake_cost, core_cost): Decrease r64 multiply latencies. * gcc.target/i386/wmul-3.c: New test. From-SVN: r255760 --- gcc/ChangeLog | 5 ++ gcc/config/i386/x86-tune-costs.h | 9 ++-- gcc/testsuite/ChangeLog | 4 ++ gcc/testsuite/gcc.target/i386/wmul-3.c | 66 ++++++++++++++++++++++++++ 4 files changed, 80 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/wmul-3.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5209a5a22eb..0f0418590be 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2017-12-17 Markus Trippelsdorf <markus@trippelsdorf.de> + + * x86-tune-costs.h (skylake_cost, core_cost): Decrease r64 multiply + latencies. + 2017-12-16 Sandra Loosemore <sandra@codesourcery.com> * doc/invoke.texi: Fix some typos. diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h index 64821933830..477e478f1f7 100644 --- a/gcc/config/i386/x86-tune-costs.h +++ b/gcc/config/i386/x86-tune-costs.h @@ -1538,8 +1538,8 @@ struct processor_costs skylake_cost = { {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ COSTS_N_INSNS (4), /* HI */ COSTS_N_INSNS (3), /* SI */ - COSTS_N_INSNS (4), /* DI */ - COSTS_N_INSNS (4)}, /* other */ + COSTS_N_INSNS (3), /* DI */ + COSTS_N_INSNS (3)}, /* other */ 0, /* cost of multiply per each bit set */ /* Expanding div/mod currently doesn't consider parallelism. So the cost model is not realistic. We compensate by increasing the latencies a bit. 
*/ @@ -2341,8 +2341,9 @@ struct processor_costs core_cost = { {COSTS_N_INSNS (3), /* cost of starting multiply for QI */ COSTS_N_INSNS (4), /* HI */ COSTS_N_INSNS (3), /* SI */ - COSTS_N_INSNS (4), /* DI */ - COSTS_N_INSNS (4)}, /* other */ + /* Here we tune for Sandybridge or newer. */ + COSTS_N_INSNS (3), /* DI */ + COSTS_N_INSNS (3)}, /* other */ 0, /* cost of multiply per each bit set */ /* Expanding div/mod currently doesn't consider parallelism. So the cost model is not realistic. We compensate by increasing the latencies a bit. */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 70e9bcf133b..c7d3977e43e 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,7 @@ +2017-12-17 Markus Trippelsdorf <markus@trippelsdorf.de> + + * gcc.target/i386/wmul-3.c: New test. + 2017-12-16 Martin Sebor <msebor@redhat.com> PR tree-optimization/78918 diff --git a/gcc/testsuite/gcc.target/i386/wmul-3.c b/gcc/testsuite/gcc.target/i386/wmul-3.c new file mode 100644 index 00000000000..5f161907638 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/wmul-3.c @@ -0,0 +1,66 @@ +/* { dg-do compile { target { ! 
ia32 } } } */ +/* { dg-options "-O2 -march=sandybridge" } */ + +#include <stdint.h> +#include <string.h> + +static const char b100_tab[200] = { + '0', '0', '0', '1', '0', '2', '0', '3', '0', '4', + '0', '5', '0', '6', '0', '7', '0', '8', '0', '9', + '1', '0', '1', '1', '1', '2', '1', '3', '1', '4', + '1', '5', '1', '6', '1', '7', '1', '8', '1', '9', + '2', '0', '2', '1', '2', '2', '2', '3', '2', '4', + '2', '5', '2', '6', '2', '7', '2', '8', '2', '9', + '3', '0', '3', '1', '3', '2', '3', '3', '3', '4', + '3', '5', '3', '6', '3', '7', '3', '8', '3', '9', + '4', '0', '4', '1', '4', '2', '4', '3', '4', '4', + '4', '5', '4', '6', '4', '7', '4', '8', '4', '9', + '5', '0', '5', '1', '5', '2', '5', '3', '5', '4', + '5', '5', '5', '6', '5', '7', '5', '8', '5', '9', + '6', '0', '6', '1', '6', '2', '6', '3', '6', '4', + '6', '5', '6', '6', '6', '7', '6', '8', '6', '9', + '7', '0', '7', '1', '7', '2', '7', '3', '7', '4', + '7', '5', '7', '6', '7', '7', '7', '8', '7', '9', + '8', '0', '8', '1', '8', '2', '8', '3', '8', '4', + '8', '5', '8', '6', '8', '7', '8', '8', '8', '9', + '9', '0', '9', '1', '9', '2', '9', '3', '9', '4', + '9', '5', '9', '6', '9', '7', '9', '8', '9', '9', +}; + +void uint64_to_ascii_ta7_32_base100(uint64_t val, char *dst) { + const int64_t POW10_10 = ((int64_t)10) * 1000 * 1000 * 1000; + const uint64_t POW2_57_DIV_POW100_4 = + ((int64_t)(1) << 57) / 100 / 100 / 100 / 100 + 1; + const uint64_t MASK32 = ((int64_t)(1) << 32) - 1; + int64_t hix = val / POW10_10; + int64_t lox = val % POW10_10; + int64_t lor = lox & (uint64_t)(-2); + uint64_t hi = hix * POW2_57_DIV_POW100_4; + uint64_t lo = lor * POW2_57_DIV_POW100_4; + memcpy(dst + 0 * 10 + 0, &b100_tab[(hi >> 57) * 2], 2); + memcpy(dst + 1 * 10 + 0, &b100_tab[(lo >> 57) * 2], 2); + hi = (hi >> 25) + 1; + lo = (lo >> 25) + 1; + hi = (hi & MASK32) * 100; + lo = (lo & MASK32) * 100; + memcpy(dst + 0 * 10 + 2, &b100_tab[(hi >> 32) * 2], 2); + hi = (hi & MASK32) * 100; + memcpy(dst + 1 * 10 + 2, &b100_tab[(lo >> 32) * 2], 2); + lo = 
(lo & MASK32) * 100; + memcpy(dst + 0 * 10 + 4, &b100_tab[(hi >> 32) * 2], 2); + hi = (hi & MASK32) * 100; + memcpy(dst + 1 * 10 + 4, &b100_tab[(lo >> 32) * 2], 2); + lo = (lo & MASK32) * 100; + memcpy(dst + 0 * 10 + 6, &b100_tab[(hi >> 32) * 2], 2); + hi = (hi & MASK32) * 100; + memcpy(dst + 1 * 10 + 6, &b100_tab[(lo >> 32) * 2], 2); + lo = (lo & MASK32) * 100; + hi >>= 32; + lo >>= 32; + lo = (lo & (-2)) | (lox & 1); + memcpy(dst + 0 * 10 + 8, &b100_tab[hi * 2], 2); + memcpy(dst + 1 * 10 + 8, &b100_tab[lo * 2], 2); + dst[2 * 10] = 0; +} + +/* { dg-final { scan-assembler-times "imulq" 11 } } */ -- 2.30.2