i386: Improve expansion of __builtin_parity
author Uros Bizjak <ubizjak@gmail.com>
Sun, 7 Jun 2020 20:07:28 +0000 (22:07 +0200)
committer Uros Bizjak <ubizjak@gmail.com>
Sun, 7 Jun 2020 20:09:49 +0000 (22:09 +0200)
GCC currently hides the shift and xor reduction inside a
backend-specific UNSPEC PARITY, making it invisible to the RTL optimizers
until very late during compilation.  It is normally reasonable for the
middle-end to maintain wider mode representations for as long as possible
and split them later, but this only helps if the semantics are visible
at the RTL level (to combine and other passes).  UNSPECs are black
boxes, so in this case splitting early (during RTL expansion) is a
better strategy.

It turns out that the popcount instruction on modern x86_64 processors
has (almost) made the integer parity flag in the x86 ALU completely
obsolete, especially as POPCOUNT's integer semantics are a much better
fit to RTL.  The one remaining case where these transistors are useful
is where __builtin_parity is immediately tested by a conditional branch,
and therefore the result is wanted in a flags register rather than as
an integer.  This case is captured by two peephole2 optimizations in
the attached patch.

2020-06-07  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog:

* config/i386/i386.md (paritydi2, paritysi2): Expand reduction
via shift and xor to an UNSPEC PARITY matching a parityhi2_cmp.
(paritydi2_cmp, paritysi2_cmp): Delete these define_insn_and_split.
(parityhi2, parityqi2): New expanders.
(parityhi2_cmp): Implement set parity flag with xorb insn.
(parityqi2_cmp): Implement set parity flag with testb insn.
New peephole2s to use these insns (UNSPEC PARITY) when appropriate.

gcc/testsuite/ChangeLog:

* gcc.target/i386/parity-3.c: New test.
* gcc.target/i386/parity-4.c: Likewise.
* gcc.target/i386/parity-5.c: Likewise.
* gcc.target/i386/parity-6.c: Likewise.
* gcc.target/i386/parity-7.c: Likewise.
* gcc.target/i386/parity-8.c: Likewise.
* gcc.target/i386/parity-9.c: Likewise.

gcc/config/i386/i386.md
gcc/testsuite/gcc.target/i386/parity-3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/parity-4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/parity-5.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/parity-6.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/parity-7.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/parity-8.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/parity-9.c [new file with mode: 0644]

index a8592a95fe1f152515816a356c6373244a7574a7..9db7469dfcc08a1ca01ea3938a0f75d6b8327eef 100644 (file)
   "! TARGET_POPCNT"
 {
   rtx scratch = gen_reg_rtx (QImode);
+  rtx hipart1 = gen_reg_rtx (SImode);
+  rtx lopart1 = gen_reg_rtx (SImode);
+  rtx xor1 = gen_reg_rtx (SImode);
+  rtx shift2 = gen_reg_rtx (SImode);
+  rtx hipart2 = gen_reg_rtx (HImode);
+  rtx lopart2 = gen_reg_rtx (HImode);
+  rtx xor2 = gen_reg_rtx (HImode);
 
-  emit_insn (gen_paritydi2_cmp (NULL_RTX, NULL_RTX,
-                               NULL_RTX, operands[1]));
+  if (TARGET_64BIT)
+    {
+      rtx shift1 = gen_reg_rtx (DImode);
+      emit_insn (gen_lshrdi3 (shift1, operands[1], GEN_INT (32)));
+      emit_move_insn (hipart1, gen_lowpart (SImode, shift1));
+    }
+  else
+    emit_move_insn (hipart1, gen_highpart (SImode, operands[1]));
+
+  emit_move_insn (lopart1, gen_lowpart (SImode, operands[1]));
+  emit_insn (gen_xorsi3 (xor1, hipart1, lopart1));
+
+  emit_insn (gen_lshrsi3 (shift2, xor1, GEN_INT (16)));
+  emit_move_insn (hipart2, gen_lowpart (HImode, shift2));
+  emit_move_insn (lopart2, gen_lowpart (HImode, xor1));
+  emit_insn (gen_xorhi3 (xor2, hipart2, lopart2));
+
+  emit_insn (gen_parityhi2_cmp (xor2));
 
   ix86_expand_setcc (scratch, ORDERED,
                     gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx);
   "! TARGET_POPCNT"
 {
   rtx scratch = gen_reg_rtx (QImode);
+  rtx shift = gen_reg_rtx (SImode);
+  rtx hipart = gen_reg_rtx (HImode);
+  rtx lopart = gen_reg_rtx (HImode);
+  rtx tmp = gen_reg_rtx (HImode);
+
+  emit_insn (gen_lshrsi3 (shift, operands[1], GEN_INT (16)));
+  emit_move_insn (hipart, gen_lowpart (HImode, shift));
+  emit_move_insn (lopart, gen_lowpart (HImode, operands[1]));
+  emit_insn (gen_xorhi3 (tmp, hipart, lopart));
 
-  emit_insn (gen_paritysi2_cmp (NULL_RTX, NULL_RTX, operands[1]));
+  emit_insn (gen_parityhi2_cmp (tmp));
 
   ix86_expand_setcc (scratch, ORDERED,
                     gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx);
   DONE;
 })
 
-(define_insn_and_split "paritydi2_cmp"
-  [(set (reg:CC FLAGS_REG)
-       (unspec:CC [(match_operand:DI 3 "register_operand" "0")]
-                  UNSPEC_PARITY))
-   (clobber (match_scratch:DI 0 "=r"))
-   (clobber (match_scratch:SI 1 "=&r"))
-   (clobber (match_scratch:HI 2 "=Q"))]
+(define_expand "parityhi2"
+  [(set (match_operand:HI 0 "register_operand")
+       (parity:HI (match_operand:HI 1 "register_operand")))]
   "! TARGET_POPCNT"
-  "#"
-  "&& reload_completed"
-  [(parallel
-     [(set (match_dup 1)
-          (xor:SI (match_dup 1) (match_dup 4)))
-      (clobber (reg:CC FLAGS_REG))])
-   (parallel
-     [(set (reg:CC FLAGS_REG)
-          (unspec:CC [(match_dup 1)] UNSPEC_PARITY))
-      (clobber (match_dup 1))
-      (clobber (match_dup 2))])]
 {
-  operands[4] = gen_lowpart (SImode, operands[3]);
+  rtx scratch = gen_reg_rtx (QImode);
 
-  if (TARGET_64BIT)
-    {
-      emit_move_insn (operands[1], gen_lowpart (SImode, operands[3]));
-      emit_insn (gen_lshrdi3 (operands[3], operands[3], GEN_INT (32)));
-    }
-  else
-    operands[1] = gen_highpart (SImode, operands[3]);
+  emit_insn (gen_parityhi2_cmp (operands[1]));
+
+  ix86_expand_setcc (scratch, ORDERED,
+                    gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx);
+
+  emit_insn (gen_zero_extendqihi2 (operands[0], scratch));
+  DONE;
 })
 
-(define_insn_and_split "paritysi2_cmp"
-  [(set (reg:CC FLAGS_REG)
-       (unspec:CC [(match_operand:SI 2 "register_operand" "0")]
-                  UNSPEC_PARITY))
-   (clobber (match_scratch:SI 0 "=r"))
-   (clobber (match_scratch:HI 1 "=&Q"))]
+(define_expand "parityqi2"
+  [(set (match_operand:QI 0 "register_operand")
+       (parity:QI (match_operand:QI 1 "register_operand")))]
   "! TARGET_POPCNT"
-  "#"
-  "&& reload_completed"
-  [(parallel
-     [(set (match_dup 1)
-          (xor:HI (match_dup 1) (match_dup 3)))
-      (clobber (reg:CC FLAGS_REG))])
-   (parallel
-     [(set (reg:CC FLAGS_REG)
-          (unspec:CC [(match_dup 1)] UNSPEC_PARITY))
-      (clobber (match_dup 1))])]
 {
-  operands[3] = gen_lowpart (HImode, operands[2]);
+  emit_insn (gen_parityqi2_cmp (operands[1]));
 
-  emit_move_insn (operands[1], gen_lowpart (HImode, operands[2]));
-  emit_insn (gen_lshrsi3 (operands[2], operands[2], GEN_INT (16)));
+  ix86_expand_setcc (operands[0], ORDERED,
+                    gen_rtx_REG (CCmode, FLAGS_REG), const0_rtx);
+  DONE;
 })
 
-(define_insn "*parityhi2_cmp"
+(define_insn "parityhi2_cmp"
   [(set (reg:CC FLAGS_REG)
-       (unspec:CC [(match_operand:HI 1 "register_operand" "0")]
+       (unspec:CC [(match_operand:HI 0 "register_operand" "+Q")]
                   UNSPEC_PARITY))
-   (clobber (match_scratch:HI 0 "=Q"))]
-  "! TARGET_POPCNT"
+   (clobber (match_dup 0))]
+  ""
   "xor{b}\t{%h0, %b0|%b0, %h0}"
   [(set_attr "length" "2")
-   (set_attr "mode" "HI")])
+   (set_attr "mode" "QI")])
+
+(define_insn "parityqi2_cmp"
+  [(set (reg:CC FLAGS_REG)
+       (unspec:CC [(match_operand:QI 0 "register_operand" "q")]
+                  UNSPEC_PARITY))]
+  ""
+  "test{b}\t%0, %0"
+  [(set_attr "mode" "QI")])
+
+;; Replace zero_extend:HI followed by parityhi2_cmp with parityqi2_cmp
+(define_peephole2
+  [(set (match_operand:HI 0 "register_operand")
+       (zero_extend:HI (match_operand:QI 1 "register_operand")))
+   (parallel [(set (reg:CC FLAGS_REG)
+                  (unspec:CC [(match_dup 0)] UNSPEC_PARITY))
+             (clobber (match_dup 0))])]
+  ""
+  [(set (reg:CC FLAGS_REG)
+       (unspec:CC [(match_dup 1)] UNSPEC_PARITY))])
+
+;; Eliminate QImode popcount&1 using parity flag
+(define_peephole2
+  [(set (match_operand:SI 0 "register_operand")
+       (zero_extend:SI (match_operand:QI 1 "register_operand")))
+   (parallel [(set (match_operand:SI 2 "register_operand")
+                  (popcount:SI (match_dup 0)))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (reg:CCZ FLAGS_REG)
+       (compare:CCZ (and:QI (match_operand:QI 3 "register_operand")
+                            (const_int 1))
+                    (const_int 0)))
+   (set (pc) (if_then_else (match_operator 4 "bt_comparison_operator"
+                           [(reg:CCZ FLAGS_REG)
+                            (const_int 0)])
+                          (label_ref (match_operand 5))
+                          (pc)))]
+  "REGNO (operands[2]) == REGNO (operands[3])
+   && peep2_reg_dead_p (3, operands[0])
+   && peep2_reg_dead_p (3, operands[2])
+   && peep2_regno_dead_p (4, FLAGS_REG)"
+  [(set (reg:CC FLAGS_REG)
+       (unspec:CC [(match_dup 1)] UNSPEC_PARITY))
+   (set (pc) (if_then_else (match_op_dup 4 [(reg:CC FLAGS_REG)
+                                           (const_int 0)])
+                          (label_ref (match_dup 5))
+                          (pc)))]
+{
+  operands[4] = shallow_copy_rtx (operands[4]);
+  PUT_CODE (operands[4], GET_CODE (operands[4]) == EQ ? UNORDERED : ORDERED);
+})
+
+;; Eliminate HImode popcount&1 using parity flag
+(define_peephole2
+  [(match_scratch:HI 0 "Q")
+   (parallel [(set (match_operand:HI 1 "register_operand")
+                  (popcount:HI
+                   (match_operand:HI 2 "nonimmediate_operand")))
+             (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand 3 "register_operand")
+       (zero_extend (match_dup 1)))
+   (set (reg:CCZ FLAGS_REG)
+       (compare:CCZ (and:QI (match_operand:QI 4 "register_operand")
+                            (const_int 1))
+                    (const_int 0)))
+   (set (pc) (if_then_else (match_operator 5 "bt_comparison_operator"
+                           [(reg:CCZ FLAGS_REG)
+                            (const_int 0)])
+                          (label_ref (match_operand 6))
+                          (pc)))]
+  "REGNO (operands[3]) == REGNO (operands[4])
+   && peep2_reg_dead_p (3, operands[1])
+   && peep2_reg_dead_p (3, operands[3])
+   && peep2_regno_dead_p (4, FLAGS_REG)"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel [(set (reg:CC FLAGS_REG)
+                  (unspec:CC [(match_dup 0)] UNSPEC_PARITY))
+             (clobber (match_dup 0))])
+   (set (pc) (if_then_else (match_op_dup 5 [(reg:CC FLAGS_REG)
+                                           (const_int 0)])
+                          (label_ref (match_dup 6))
+                          (pc)))]
+{
+  operands[5] = shallow_copy_rtx (operands[5]);
+  PUT_CODE (operands[5], GET_CODE (operands[5]) == EQ ? UNORDERED : ORDERED);
+})
 
 \f
 ;; Thread-local storage patterns for ELF.
diff --git a/gcc/testsuite/gcc.target/i386/parity-3.c b/gcc/testsuite/gcc.target/i386/parity-3.c
new file mode 100644 (file)
index 0000000..6b72591
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */
+/* { dg-final { scan-assembler "setp" } } */
+/* { dg-final { scan-assembler "jnp" } } */
+/* { dg-final { scan-assembler "jp" } } */
+
+void dummy(void);
+
+int foo(unsigned int x)
+{
+  return !__builtin_parity(x);
+}
+
+void bar(unsigned int x)
+{
+  if (__builtin_parity(x))
+    dummy();
+}
+
+void baz(unsigned int x)
+{
+  if (!__builtin_parity(x))
+    dummy();
+}
diff --git a/gcc/testsuite/gcc.target/i386/parity-4.c b/gcc/testsuite/gcc.target/i386/parity-4.c
new file mode 100644 (file)
index 0000000..48384c2
--- /dev/null
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */
+/* { dg-final { scan-assembler "setp" } } */
+/* { dg-final { scan-assembler "jnp" } } */
+/* { dg-final { scan-assembler "jp" } } */
+
+void dummy(void);
+
+int foo(unsigned long long x)
+{
+  return !__builtin_parityll(x);
+}
+
+void bar(unsigned long long x)
+{
+  if (__builtin_parityll(x))
+    dummy();
+}
+
+void baz(unsigned long long x)
+{
+  if (!__builtin_parityll(x))
+    dummy();
+}
diff --git a/gcc/testsuite/gcc.target/i386/parity-5.c b/gcc/testsuite/gcc.target/i386/parity-5.c
new file mode 100644 (file)
index 0000000..d19ed26
--- /dev/null
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=core-avx2" } */
+/* { dg-final { scan-assembler "popcnt" } } */
+/* { dg-final { scan-assembler "and" } } */
+
+int foo(unsigned int x)
+{
+  return __builtin_parity(x);
+}
diff --git a/gcc/testsuite/gcc.target/i386/parity-6.c b/gcc/testsuite/gcc.target/i386/parity-6.c
new file mode 100644 (file)
index 0000000..33918e1
--- /dev/null
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=core-avx2" } */
+/* { dg-final { scan-assembler "popcnt" } } */
+/* { dg-final { scan-assembler "and" } } */
+
+int foo(unsigned long long x)
+{
+  return __builtin_parityll(x);
+}
diff --git a/gcc/testsuite/gcc.target/i386/parity-7.c b/gcc/testsuite/gcc.target/i386/parity-7.c
new file mode 100644 (file)
index 0000000..ed9357f
--- /dev/null
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */
+/* { dg-additional-options "-mregparm=1" { target ia32 } } */
+/* { dg-final { scan-assembler-times "test" 2 } } */
+/* { dg-final { scan-assembler-not "shr" } } */
+
+int foo(unsigned char x)
+{
+  return __builtin_parity(x);
+}
+
+int bar(unsigned char x)
+{
+  return __builtin_parityll(x);
+}
diff --git a/gcc/testsuite/gcc.target/i386/parity-8.c b/gcc/testsuite/gcc.target/i386/parity-8.c
new file mode 100644 (file)
index 0000000..a878455
--- /dev/null
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=core-avx2 -mno-popcnt" } */
+/* { dg-final { scan-assembler-not "shr" } } */
+
+int foo(unsigned short x)
+{
+  return __builtin_parity(x);
+}
+
+int bar(unsigned short x)
+{
+  return __builtin_parityll(x);
+}
diff --git a/gcc/testsuite/gcc.target/i386/parity-9.c b/gcc/testsuite/gcc.target/i386/parity-9.c
new file mode 100644 (file)
index 0000000..9d6cfe2
--- /dev/null
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=core-avx2" } */
+/* { dg-additional-options "-mregparm=1" { target ia32 } } */
+/* { dg-final { scan-assembler-not "popcnt" } } */
+/* { dg-final { scan-assembler-not "shr" } } */
+/* { dg-final { scan-assembler-times "jp" 2 } } */
+/* { dg-final { scan-assembler-times "jnp" 2 } } */
+
+void dummy(void);
+
+void pos8(unsigned char x)
+{
+  if (__builtin_parity(x))
+    dummy();
+}
+
+void neg8(unsigned char x)
+{
+  if (!__builtin_parity(x))
+    dummy();
+}
+
+void pos16(unsigned short x)
+{
+  if (__builtin_parity(x))
+    dummy();
+}
+
+void neg16(unsigned short x)
+{
+  if (!__builtin_parity(x))
+    dummy();
+}