From f08995eefbf579acfe40f0204727d5ce388e3d0a Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Sun, 7 Jun 2020 22:07:28 +0200 Subject: [PATCH] i386: Improve expansion of __builtin_parity GCC currently hides the shift and xor reduction inside a backend specific UNSPEC PARITY, making it invisible to the RTL optimizers until very late during compilation. It is normally reasonable for the middle-end to maintain wider mode representations for as long as possible and split them later, but this only helps if the semantics are visible at the RTL-level (to combine and other passes), but UNSPECs are black boxes, so in this case splitting early (during RTL expansion) is a better strategy. It turns out that that popcount instruction on modern x86_64 processors has (almost) made the integer parity flag in the x86 ALU completely obsolete, especially as POPCOUNT's integer semantics are a much better fit to RTL. The one remaining case where these transistors are useful is where __builtin_parity is immediately tested by a conditional branch, and therefore the result is wanted in a flags register rather than as an integer. This case is captured by two peephole2 optimizations in the attached patch. 2020-06-07 Roger Sayle gcc/ChangeLog: * config/i386/i386.md (paritydi2, paritysi2): Expand reduction via shift and xor to an USPEC PARITY matching a parityhi2_cmp. (paritydi2_cmp, paritysi2_cmp): Delete these define_insn_and_split. (parityhi2, parityqi2): New expanders. (parityhi2_cmp): Implement set parity flag with xorb insn. (parityqi2_cmp): Implement set parity flag with testb insn. New peephole2s to use these insns (UNSPEC PARITY) when appropriate. gcc/testsuite/ChangeLog: * gcc.target/i386/parity-3.c: New test. * gcc.target/i386/parity-4.c: Likewise. * gcc.target/i386/parity-5.c: Likewise. * gcc.target/i386/parity-6.c: Likewise. * gcc.target/i386/parity-7.c: Likewise. * gcc.target/i386/parity-8.c: Likewise. * gcc.target/i386/parity-9.c: Likewise. --- gcc/config/i386/i386.md | 196 +++++++++++++++++------ gcc/testsuite/gcc.target/i386/parity-3.c | 24 +++ gcc/testsuite/gcc.target/i386/parity-4.c | 24 +++ gcc/testsuite/gcc.target/i386/parity-5.c | 9 ++ gcc/testsuite/gcc.target/i386/parity-6.c | 9 ++ gcc/testsuite/gcc.target/i386/parity-7.c | 15 ++ gcc/testsuite/gcc.target/i386/parity-8.c | 13 ++ gcc/testsuite/gcc.target/i386/parity-9.c | 33 ++++ 8 files changed, 270 insertions(+), 53 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/parity-3.c create mode 100644 gcc/testsuite/gcc.target/i386/parity-4.c create mode 100644 gcc/testsuite/gcc.target/i386/parity-5.c create mode 100644 gcc/testsuite/gcc.target/i386/parity-6.c create mode 100644 gcc/testsuite/gcc.target/i386/parity-7.c create mode 100644 gcc/testsuite/gcc.target/i386/parity-8.c create mode 100644 gcc/testsuite/gcc.target/i386/parity-9.c diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index a8592a95fe1..9db7469dfcc 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -14866,9 +14866,32 @@ "! TARGET_POPCNT" { rtx scratch = gen_reg_rtx (QImode); + rtx hipart1 = gen_reg_rtx (SImode); + rtx lopart1 = gen_reg_rtx (SImode); + rtx xor1 = gen_reg_rtx (SImode); + rtx shift2 = gen_reg_rtx (SImode); + rtx hipart2 = gen_reg_rtx (HImode); + rtx lopart2 = gen_reg_rtx (HImode); + rtx xor2 = gen_reg_rtx (HImode); - emit_insn (gen_paritydi2_cmp (NULL_RTX, NULL_RTX, - NULL_RTX, operands[1])); + if (TARGET_64BIT) + { + rtx shift1 = gen_reg_rtx (DImode); + emit_insn (gen_lshrdi3 (shift1, operands[1], GEN_INT (32))); + emit_move_insn (hipart1, gen_lowpart (SImode, shift1)); + } + else + emit_move_insn (hipart1, gen_highpart (SImode, operands[1])); + + emit_move_insn (lopart1, gen_lowpart (SImode, operands[1])); + emit_insn (gen_xorsi3 (xor1, hipart1, lopart1)); + + emit_insn (gen_lshrsi3 (shift2, xor1, GEN_INT (16))); + emit_move_insn (hipart2, gen_lowpart (HImode, shift2)); + emit_move_insn (lopart2, gen_lowpart (HImode, xor1)); + emit_insn (gen_xorhi3 (xor2, hipart2, lopart2)); + + emit_insn (gen_parityhi2_cmp (xor2)); ix86_expand_setcc (scratch, ORDERED, gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx); @@ -14891,8 +14914,17 @@ "! TARGET_POPCNT" { rtx scratch = gen_reg_rtx (QImode); + rtx shift = gen_reg_rtx (SImode); + rtx hipart = gen_reg_rtx (HImode); + rtx lopart = gen_reg_rtx (HImode); + rtx tmp = gen_reg_rtx (HImode); + + emit_insn (gen_lshrsi3 (shift, operands[1], GEN_INT (16))); + emit_move_insn (hipart, gen_lowpart (HImode, shift)); + emit_move_insn (lopart, gen_lowpart (HImode, operands[1])); + emit_insn (gen_xorhi3 (tmp, hipart, lopart)); - emit_insn (gen_paritysi2_cmp (NULL_RTX, NULL_RTX, operands[1])); + emit_insn (gen_parityhi2_cmp (tmp)); ix86_expand_setcc (scratch, ORDERED, gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx); @@ -14901,70 +14933,128 @@ DONE; }) -(define_insn_and_split "paritydi2_cmp" - [(set (reg:CC FLAGS_REG) - (unspec:CC [(match_operand:DI 3 "register_operand" "0")] - UNSPEC_PARITY)) - (clobber (match_scratch:DI 0 "=r")) - (clobber (match_scratch:SI 1 "=&r")) - (clobber (match_scratch:HI 2 "=Q"))] +(define_expand "parityhi2" + [(set (match_operand:HI 0 "register_operand") + (parity:HI (match_operand:HI 1 "register_operand")))] "! TARGET_POPCNT" - "#" - "&& reload_completed" - [(parallel - [(set (match_dup 1) - (xor:SI (match_dup 1) (match_dup 4))) - (clobber (reg:CC FLAGS_REG))]) - (parallel - [(set (reg:CC FLAGS_REG) - (unspec:CC [(match_dup 1)] UNSPEC_PARITY)) - (clobber (match_dup 1)) - (clobber (match_dup 2))])] { - operands[4] = gen_lowpart (SImode, operands[3]); + rtx scratch = gen_reg_rtx (QImode); - if (TARGET_64BIT) - { - emit_move_insn (operands[1], gen_lowpart (SImode, operands[3])); - emit_insn (gen_lshrdi3 (operands[3], operands[3], GEN_INT (32))); - } - else - operands[1] = gen_highpart (SImode, operands[3]); + emit_insn (gen_parityhi2_cmp (operands[1])); + + ix86_expand_setcc (scratch, ORDERED, + gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx); + + emit_insn (gen_zero_extendqihi2 (operands[0], scratch)); + DONE; }) -(define_insn_and_split "paritysi2_cmp" - [(set (reg:CC FLAGS_REG) - (unspec:CC [(match_operand:SI 2 "register_operand" "0")] - UNSPEC_PARITY)) - (clobber (match_scratch:SI 0 "=r")) - (clobber (match_scratch:HI 1 "=&Q"))] +(define_expand "parityqi2" + [(set (match_operand:QI 0 "register_operand") + (parity:QI (match_operand:QI 1 "register_operand")))] "! TARGET_POPCNT" - "#" - "&& reload_completed" - [(parallel - [(set (match_dup 1) - (xor:HI (match_dup 1) (match_dup 3))) - (clobber (reg:CC FLAGS_REG))]) - (parallel - [(set (reg:CC FLAGS_REG) - (unspec:CC [(match_dup 1)] UNSPEC_PARITY)) - (clobber (match_dup 1))])] { - operands[3] = gen_lowpart (HImode, operands[2]); + emit_insn (gen_parityqi2_cmp (operands[1])); - emit_move_insn (operands[1], gen_lowpart (HImode, operands[2])); - emit_insn (gen_lshrsi3 (operands[2], operands[2], GEN_INT (16))); + ix86_expand_setcc (operands[0], ORDERED, + gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx); + DONE; }) -(define_insn "*parityhi2_cmp" +(define_insn "parityhi2_cmp" [(set (reg:CC FLAGS_REG) - (unspec:CC [(match_operand:HI 1 "register_operand" "0")] + (unspec:CC [(match_operand:HI 0 "register_operand" "+Q")] UNSPEC_PARITY)) - (clobber (match_scratch:HI 0 "=Q"))] - "! TARGET_POPCNT" + (clobber (match_dup 0))] + "" "xor{b}\t{%h0, %b0|%b0, %h0}" [(set_attr "length" "2") - (set_attr "mode" "HI")]) + (set_attr "mode" "QI")]) + +(define_insn "parityqi2_cmp" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_operand:QI 0 "register_operand" "q")] + UNSPEC_PARITY))] + "" + "test{b}\t%0, %0" + [(set_attr "mode" "QI")]) + +;; Replace zero_extend:HI followed by parityhi2_cmp with parityqi2_cmp +(define_peephole2 + [(set (match_operand:HI 0 "register_operand") + (zero_extend:HI (match_operand:QI 1 "register_operand"))) + (parallel [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 0)] UNSPEC_PARITY)) + (clobber (match_dup 0))])] + "" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 1)] UNSPEC_PARITY))]) + +;; Eliminate QImode popcount&1 using parity flag +(define_peephole2 + [(set (match_operand:SI 0 "register_operand") + (zero_extend:SI (match_operand:QI 1 "register_operand"))) + (parallel [(set (match_operand:SI 2 "register_operand") + (popcount:SI (match_dup 0))) + (clobber (reg:CC FLAGS_REG))]) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ (and:QI (match_operand:QI 3 "register_operand") + (const_int 1)) + (const_int 0))) + (set (pc) (if_then_else (match_operator 4 "bt_comparison_operator" + [(reg:CCZ FLAGS_REG) + (const_int 0)]) + (label_ref (match_operand 5)) + (pc)))] + "REGNO (operands[2]) == REGNO (operands[3]) + && peep2_reg_dead_p (3, operands[0]) + && peep2_reg_dead_p (3, operands[2]) + && peep2_regno_dead_p (4, FLAGS_REG)" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 1)] UNSPEC_PARITY)) + (set (pc) (if_then_else (match_op_dup 4 [(reg:CC FLAGS_REG) + (const_int 0)]) + (label_ref (match_dup 5)) + (pc)))] +{ + operands[4] = shallow_copy_rtx (operands[4]); + PUT_CODE (operands[4], GET_CODE (operands[4]) == EQ ? UNORDERED : ORDERED); +}) + +;; Eliminate HImode popcount&1 using parity flag +(define_peephole2 + [(match_scratch:HI 0 "Q") + (parallel [(set (match_operand:HI 1 "register_operand") + (popcount:HI + (match_operand:HI 2 "nonimmediate_operand"))) + (clobber (reg:CC FLAGS_REG))]) + (set (match_operand 3 "register_operand") + (zero_extend (match_dup 1))) + (set (reg:CCZ FLAGS_REG) + (compare:CCZ (and:QI (match_operand:QI 4 "register_operand") + (const_int 1)) + (const_int 0))) + (set (pc) (if_then_else (match_operator 5 "bt_comparison_operator" + [(reg:CCZ FLAGS_REG) + (const_int 0)]) + (label_ref (match_operand 6)) + (pc)))] + "REGNO (operands[3]) == REGNO (operands[4]) + && peep2_reg_dead_p (3, operands[1]) + && peep2_reg_dead_p (3, operands[3]) + && peep2_regno_dead_p (4, FLAGS_REG)" + [(set (match_dup 0) (match_dup 2)) + (parallel [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_dup 0)] UNSPEC_PARITY)) + (clobber (match_dup 0))]) + (set (pc) (if_then_else (match_op_dup 5 [(reg:CC FLAGS_REG) + (const_int 0)]) + (label_ref (match_dup 6)) + (pc)))] +{ + operands[5] = shallow_copy_rtx (operands[5]); + PUT_CODE (operands[5], GET_CODE (operands[5]) == EQ ? UNORDERED : ORDERED); +}) ;; Thread-local storage patterns for ELF. diff --git a/gcc/testsuite/gcc.target/i386/parity-3.c b/gcc/testsuite/gcc.target/i386/parity-3.c new file mode 100644 index 00000000000..6b72591aad1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/parity-3.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */ +/* { dg-final { scan-assembler "setp" } } */ +/* { dg-final { scan-assembler "jnp" } } */ +/* { dg-final { scan-assembler "jp" } } */ + +void dummy(void); + +int foo(unsigned int x) +{ + return !__builtin_parity(x); +} + +void bar(unsigned int x) +{ + if (__builtin_parity(x)) + dummy(); +} + +void baz(unsigned int x) +{ + if (!__builtin_parity(x)) + dummy(); +} diff --git a/gcc/testsuite/gcc.target/i386/parity-4.c b/gcc/testsuite/gcc.target/i386/parity-4.c new file mode 100644 index 00000000000..48384c23ea1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/parity-4.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */ +/* { dg-final { scan-assembler "setp" } } */ +/* { dg-final { scan-assembler "jnp" } } */ +/* { dg-final { scan-assembler "jp" } } */ + +void dummy(void); + +int foo(unsigned long long x) +{ + return !__builtin_parityll(x); +} + +void bar(unsigned long long x) +{ + if (__builtin_parityll(x)) + dummy(); +} + +void baz(unsigned long long x) +{ + if (!__builtin_parityll(x)) + dummy(); +} diff --git a/gcc/testsuite/gcc.target/i386/parity-5.c b/gcc/testsuite/gcc.target/i386/parity-5.c new file mode 100644 index 00000000000..d19ed265b61 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/parity-5.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=core-avx2" } */ +/* { dg-final { scan-assembler "popcnt" } } */ +/* { dg-final { scan-assembler "and" } } */ + +int foo(unsigned int x) +{ + return __builtin_parity(x); +} diff --git a/gcc/testsuite/gcc.target/i386/parity-6.c b/gcc/testsuite/gcc.target/i386/parity-6.c new file mode 100644 index 00000000000..33918e121d7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/parity-6.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=core-avx2" } */ +/* { dg-final { scan-assembler "popcnt" } } */ +/* { dg-final { scan-assembler "and" } } */ + +int foo(unsigned long long x) +{ + return __builtin_parityll(x); +} diff --git a/gcc/testsuite/gcc.target/i386/parity-7.c b/gcc/testsuite/gcc.target/i386/parity-7.c new file mode 100644 index 00000000000..ed9357f47db --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/parity-7.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */ +/* { dg-additional-options "-mregparm=1" { target ia32 } } */ +/* { dg-final { scan-assembler-times "test" 2 } } */ +/* { dg-final { scan-assembler-not "shr" } } */ + +int foo(unsigned char x) +{ + return __builtin_parity(x); +} + +int bar(unsigned char x) +{ + return __builtin_parityll(x); +} diff --git a/gcc/testsuite/gcc.target/i386/parity-8.c b/gcc/testsuite/gcc.target/i386/parity-8.c new file mode 100644 index 00000000000..a87845583ff --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/parity-8.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */ +/* { dg-final { scan-assembler-not "shr" } } */ + +int foo(unsigned short x) +{ + return __builtin_parity(x); +} + +int bar(unsigned short x) +{ + return __builtin_parityll(x); +} diff --git a/gcc/testsuite/gcc.target/i386/parity-9.c b/gcc/testsuite/gcc.target/i386/parity-9.c new file mode 100644 index 00000000000..9d6cfe26a24 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/parity-9.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=core-avx2" } */ +/* { dg-additional-options "-mregparm=1" { target ia32 } } */ +/* { dg-final { scan-assembler-not "popcnt" } } */ +/* { dg-final { scan-assembler-not "shr" } } */ +/* { dg-final { scan-assembler-times "jp" 2 } } */ +/* { dg-final { scan-assembler-times "jnp" 2 } } */ + +void dummy(void); + +void pos8(unsigned char x) +{ + if (__builtin_parity(x)) + dummy(); +} + +void neg8(unsigned char x) +{ + if (!__builtin_parity(x)) + dummy(); +} + +void pos16(unsigned short x) +{ + if (__builtin_parity(x)) + dummy(); +} + +void neg16(unsigned short x) +{ + if (!__builtin_parity(x)) + dummy(); +} -- 2.30.2