From 7813b280435f2e19c53df9f8b04a3d28bb561aa8 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov
Date: Thu, 29 Jun 2017 09:21:57 +0000
Subject: [PATCH] re PR target/70119 (AArch64 should take advantage of implicit truncation of variable shift amount without defining SHIFT_COUNT_TRUNCATED)

2017-06-29  Kyrylo Tkachov
	    Michael Collison

	PR target/70119
	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
	New pattern.
	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
	(*aarch64_reg_<mode>3_minus_mask): New pattern.
	(*aarch64_<optab>_reg_di3_mask2): New pattern.
	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
	of shift when the shift amount is masked with constant equal to
	the size of the mode.
	* config/aarch64/predicates.md (subreg_lowpart_operator): New
	predicate.

2017-06-29  Kyrylo Tkachov
	    Michael Collison

	PR target/70119
	* gcc.target/aarch64/var_shift_mask_1.c: New test.

Co-Authored-By: Michael Collison

From-SVN: r249774
---
 gcc/ChangeLog                                 | 15 +++
 gcc/config/aarch64/aarch64.c                  | 50 +++++++---
 gcc/config/aarch64/aarch64.md                 | 91 +++++++++++++++++++
 gcc/config/aarch64/predicates.md              |  4 +
 gcc/testsuite/ChangeLog                       |  6 ++
 .../gcc.target/aarch64/var_shift_mask_1.c     | 61 +++++++++++++
 6 files changed, 213 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 4f117a0bb7f..17feaec7574 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,18 @@
+2017-06-29  Kyrylo Tkachov
+	    Michael Collison
+
+	PR target/70119
+	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
+	New pattern.
+	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
+	(*aarch64_reg_<mode>3_minus_mask): New pattern.
+	(*aarch64_<optab>_reg_di3_mask2): New pattern.
+	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
+	of shift when the shift amount is masked with constant equal to
+	the size of the mode.
+	* config/aarch64/predicates.md (subreg_lowpart_operator): New
+	predicate.
+
 2017-06-29  Martin Liska
 
 	* config/i386/i386.opt: Change range from [1,5] to [0,5].
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 593263ff9b2..5cf41fc0606 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7541,17 +7541,26 @@ cost_plus:
 	}
       else
 	{
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
-		{
-		  /* Vector shift (register).  */
-		  *cost += extra_cost->vect.alu;
-		}
-	      else
+	      if (speed)
+		/* Vector shift (register).  */
+		*cost += extra_cost->vect.alu;
+	    }
+	  else
+	    {
+	      if (speed)
+		/* LSLV.  */
+		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
 		{
-		  /* LSLV.  */
-		  *cost += extra_cost->alu.shift_reg;
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
 		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
@@ -7580,14 +7589,27 @@ cost_plus:
 	}
       else
 	{
-
-	  /* ASR (register) and friends.  */
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
+	      if (speed)
+		/* Vector shift (register).  */
 		*cost += extra_cost->vect.alu;
-	      else
+	    }
+	  else
+	    {
+	      if (speed)
+		/* ASR (register) and friends.  */
 		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
+		{
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
+		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
 	}
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 6bdbf650d92..e6e7e64390c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -3942,6 +3942,97 @@
 }
 )
 
+;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
+;; they truncate the shift/rotate amount by the size of the registers they
+;; operate on: 32 for W-regs, 64 for X-regs.  This allows us to optimise away
+;; such redundant masking instructions.  GCC can do that automatically when
+;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
+;; because some of the SISD shift alternatives don't perform this truncation.
+;; So this pattern exists to catch such cases.
+
+(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	   [(and:GPI (match_operand:GPI 2 "register_operand" "r")
+		     (match_operand 3 "const_int_operand" "n"))])))]
+  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode) - 1)) == 0"
+  "<shift>\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "shift_reg")]
+)
+
+(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
+  [(set (match_operand:GPI 0 "register_operand" "=&r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
+			   (match_operand 3 "const_int_operand" "n")))])))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode) - 1)) == 0)"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
+	       : operands[0]);
+    emit_insn (gen_negsi2 (tmp, operands[2]));
+
+    rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
+    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
+				     SUBREG_BYTE (operands[4]));
+    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
+    DONE;
+  }
+)
+
+(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
+  [(set (match_operand:GPI 0 "register_operand" "=&r")
+	(ashift:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (minus:QI (match_operand 2 "const_int_operand" "n")
+		    (match_operator 5 "subreg_lowpart_operator"
+		    [(and:SI (match_operand:SI 3 "register_operand" "r")
+			     (match_operand 4 "const_int_operand" "n"))]))))]
+  "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode) - 1)) == 0)
+   && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
+	       : operands[0]);
+
+    emit_insn (gen_negsi2 (tmp, operands[3]));
+
+    rtx and_op = gen_rtx_AND (SImode, tmp, operands[4]);
+    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[5]), and_op,
+				     SUBREG_BYTE (operands[5]));
+
+    emit_insn (gen_ashl<mode>3 (operands[0], operands[1], subreg_tmp));
+    DONE;
+  }
+)
+
+(define_insn "*aarch64_<optab>_reg_di3_mask2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(SHIFT:DI
+	  (match_operand:DI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	   [(and:SI (match_operand:SI 2 "register_operand" "r")
+		    (match_operand 3 "aarch64_shift_imm_di" "Usd"))])))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == 0)"
+{
+  rtx xop[3];
+  xop[0] = operands[0];
+  xop[1] = operands[1];
+  xop[2] = gen_lowpart (GET_MODE (operands[4]), operands[2]);
+  output_asm_insn ("<shift>\t%x0, %x1, %x2", xop);
+  return "";
+}
+  [(set_attr "type" "shift_reg")]
+)
+
 ;; Logical left shift using SISD or Integer instruction
 (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
   [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index cd7ded98663..ad8a43c2b2c 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -35,6 +35,10 @@
   (and (match_code "const_int")
        (match_test "op == CONST0_RTX (mode)")))
 
+(define_special_predicate "subreg_lowpart_operator"
+  (and (match_code "subreg")
+       (match_test "subreg_lowpart_p (op)")))
+
 (define_predicate "aarch64_ccmp_immediate"
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), -31, 31)")))
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 661ef8fcd1a..54a2bc134a8 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2017-06-29  Kyrylo Tkachov
+	    Michael Collison
+
+	PR target/70119
+	* gcc.target/aarch64/var_shift_mask_1.c: New test.
+
 2017-06-28  Sebastian Peryt
 
 	* gcc.target/i386/avx512vl-vpermd-1.c (_mm256_permutexvar_epi32):
diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
new file mode 100644
index 00000000000..e2b020ef3c4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* The integer variable shift and rotate instructions truncate their
+   shift amounts by the datasize.  Make sure that we don't emit a redundant
+   masking operation.  */
+
+unsigned
+f1 (unsigned x, int y)
+{
+  return x << (y & 31);
+}
+
+unsigned long
+f2 (unsigned long x, int y)
+{
+  return x << (y & 63);
+}
+
+unsigned long
+f3 (unsigned long bit_addr, int y)
+{
+  unsigned long bitnumb = bit_addr & 63;
+  return (1L << bitnumb);
+}
+
+unsigned int
+f4 (unsigned int x, unsigned int y)
+{
+  y &= 31;
+  return x >> y | (x << (32 - y));
+}
+
+unsigned long
+f5 (unsigned long x, unsigned long y)
+{
+  y &= 63;
+  return x >> y | (x << (64 - y));
+}
+
+unsigned long
+f6 (unsigned long x, unsigned long y)
+{
+
+  return (x << (64 - (y & 63)));
+
+}
+
+unsigned long
+f7 (unsigned long x, unsigned long y)
+{
+  return (x << -(y & 63));
+}
+
+/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
+/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
+/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
-- 
2.30.2
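
Editor's note: a quick way to see the effect of this change outside the
testsuite is the minimal sketch below.  It is not part of the committed
patch; the file name and the cross-compiler invocation in the comment are
illustrative assumptions.  It uses the same masked-shift and rotate idioms
the new patterns target, so after this patch an aarch64 compiler at -O2 is
expected to emit a single lsl/ror with the variable count register and no
separate AND (or NEG/SUB) of the shift amount.

/* shift_mask_demo.c -- illustrative only, not part of the patch.
   Try: aarch64-linux-gnu-gcc -O2 -S shift_mask_demo.c -o -  */

unsigned long long
shl64 (unsigned long long x, unsigned int n)
{
  /* The "& 63" keeps the C shift count well defined; it matches the
     implicit truncation LSLV performs on X-registers, so no AND insn
     should remain in the output.  */
  return x << (n & 63);
}

unsigned int
rotr32 (unsigned int x, unsigned int n)
{
  /* Rotate-right idiom that is well defined for every n; GCC recognises
     it as a 32-bit rotate, and with these patterns the count is passed
     straight to ROR without masking.  */
  return (x >> (n & 31)) | (x << (-n & 31));
}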