From: Kyrylo Tkachov Date: Tue, 21 Apr 2015 12:56:39 +0000 (+0000) Subject: [expmed] Properly account for the cost and latency of shift+add ops when synthesizing... X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=35430ca0c6c34943932b111cb55475f292a0a208;p=gcc.git [expmed] Properly account for the cost and latency of shift+add ops when synthesizing mults * expmed.c (synth_mult): Only assume overlapping shift with previous steps in alg_sub_t_m2 case. * gcc.target/aarch64/mult-synth_1.c: New test. * gcc.target/aarch64/mult-synth_2.c: Likewise. * gcc.target/aarch64/mult-synth_3.c: Likewise. * gcc.target/aarch64/mult-synth_4.c: Likewise. * gcc.target/aarch64/mult-synth_5.c: Likewise. * gcc.target/aarch64/mult-synth_6.c: Likewise. From-SVN: r222268 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 85c2ba33aed..1e56a373bdc 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,8 @@ +2015-04-21 Kyrylo Tkachov + + * expmed.c (synth_mult): Only assume overlapping + shift with previous steps in alg_sub_t_m2 case. + 2015-04-21 Richard Biener PR tree-optimization/65650 diff --git a/gcc/expmed.c b/gcc/expmed.c index 6327629d458..6679f501c2c 100644 --- a/gcc/expmed.c +++ b/gcc/expmed.c @@ -2664,14 +2664,28 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, m = exact_log2 (-orig_t + 1); if (m >= 0 && m < maxm) { - op_cost = shiftsub1_cost (speed, mode, m); + op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m); + /* If the target has a cheap shift-and-subtract insn use + that in preference to a shift insn followed by a sub insn. + Assume that the shift-and-sub is "atomic" with a latency + equal to its cost, otherwise assume that on superscalar + hardware the shift may be executed concurrently with the + earlier steps in the algorithm. 
*/ + if (shiftsub1_cost (speed, mode, m) <= op_cost) + { + op_cost = shiftsub1_cost (speed, mode, m); + op_latency = op_cost; + } + else + op_latency = add_cost (speed, mode); + new_limit.cost = best_cost.cost - op_cost; - new_limit.latency = best_cost.latency - op_cost; + new_limit.latency = best_cost.latency - op_latency; synth_mult (alg_in, (unsigned HOST_WIDE_INT) (-orig_t + 1) >> m, &new_limit, mode); alg_in->cost.cost += op_cost; - alg_in->cost.latency += op_cost; + alg_in->cost.latency += op_latency; if (CHEAPER_MULT_COST (&alg_in->cost, &best_cost)) { best_cost = alg_in->cost; @@ -2704,20 +2718,12 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, if (t % d == 0 && t > d && m < maxm && (!cache_hit || cache_alg == alg_add_factor)) { - /* If the target has a cheap shift-and-add instruction use - that in preference to a shift insn followed by an add insn. - Assume that the shift-and-add is "atomic" with a latency - equal to its cost, otherwise assume that on superscalar - hardware the shift may be executed concurrently with the - earlier steps in the algorithm. */ op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m); - if (shiftadd_cost (speed, mode, m) < op_cost) - { - op_cost = shiftadd_cost (speed, mode, m); - op_latency = op_cost; - } - else - op_latency = add_cost (speed, mode); + if (shiftadd_cost (speed, mode, m) <= op_cost) + op_cost = shiftadd_cost (speed, mode, m); + + op_latency = op_cost; + new_limit.cost = best_cost.cost - op_cost; new_limit.latency = best_cost.latency - op_latency; @@ -2742,20 +2748,11 @@ synth_mult (struct algorithm *alg_out, unsigned HOST_WIDE_INT t, if (t % d == 0 && t > d && m < maxm && (!cache_hit || cache_alg == alg_sub_factor)) { - /* If the target has a cheap shift-and-subtract insn use - that in preference to a shift insn followed by a sub insn. 
- Assume that the shift-and-sub is "atomic" with a latency - equal to it's cost, otherwise assume that on superscalar - hardware the shift may be executed concurrently with the - earlier steps in the algorithm. */ op_cost = add_cost (speed, mode) + shift_cost (speed, mode, m); - if (shiftsub0_cost (speed, mode, m) < op_cost) - { - op_cost = shiftsub0_cost (speed, mode, m); - op_latency = op_cost; - } - else - op_latency = add_cost (speed, mode); + if (shiftsub0_cost (speed, mode, m) <= op_cost) + op_cost = shiftsub0_cost (speed, mode, m); + + op_latency = op_cost; new_limit.cost = best_cost.cost - op_cost; new_limit.latency = best_cost.latency - op_latency; diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 5573e423ca8..c031a3afc88 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,12 @@ +2015-04-21 Kyrylo Tkachov + + * gcc.target/aarch64/mult-synth_1.c: New test. + * gcc.target/aarch64/mult-synth_2.c: Likewise. + * gcc.target/aarch64/mult-synth_3.c: Likewise. + * gcc.target/aarch64/mult-synth_4.c: Likewise. + * gcc.target/aarch64/mult-synth_5.c: Likewise. + * gcc.target/aarch64/mult-synth_6.c: Likewise. 
+ 2015-04-21 Richard Biener PR tree-optimization/65650 diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_1.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_1.c new file mode 100644 index 00000000000..37f079d5beb --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_1.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */ + +int +foo (int x) +{ + return x * 100; +} + +/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_2.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_2.c new file mode 100644 index 00000000000..4d2e5bf3dc1 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_2.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */ + +int +foo (int x) +{ + return x * 25; +} + +/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_3.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_3.c new file mode 100644 index 00000000000..03e83e97b50 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_3.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */ + +int +foo (int x) +{ + return x * 11; +} + +/* { dg-final { scan-assembler-times "mul\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_4.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_4.c new file mode 100644 index 00000000000..05a82bdffe0 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_4.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */ + +long +foo (int x, int y) +{ + return (long)x * 6L; +} + +/* { dg-final { scan-assembler-times 
"smull\tx\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_5.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_5.c new file mode 100644 index 00000000000..8cf3314aa2a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_5.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */ + +int +foo (int x) +{ + return x * 10; +} + +/* { dg-final { scan-assembler-not "\tw1" } } */ +/* { dg-final { cleanup-saved-temps } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/mult-synth_6.c b/gcc/testsuite/gcc.target/aarch64/mult-synth_6.c new file mode 100644 index 00000000000..e941b72351e --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/mult-synth_6.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mcpu=cortex-a57 -save-temps" } */ + +int +foo (int x) +{ + return x * 20; +} + +/* { dg-final { scan-assembler-not "\tw1" } } */ +/* { dg-final { cleanup-saved-temps } } */