From: Jakub Jelinek <jakub@redhat.com>
Date: Mon, 30 Nov 2020 09:55:43 +0000 (+0100)
Subject: expansion: Improve double-word modulo by certain constant divisors [PR97459]
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4d87bd39bafae86747944b2f8c53fdbc43b8dac3;p=gcc.git

expansion: Improve double-word modulo by certain constant divisors [PR97459]

As discussed in the PR, e.g. on x86_64 (both -m32 and -m64) there is no
double-word modulo and so we expand it to a __{,u}mod[dt]i3 call.
For certain constant divisors we can do better.  E.g. consider a
32-bit word size: 0x100000000ULL % 3 == 1, so we can partly use the Hacker's
Delight modulo-by-summing-digits approach and optimize
unsigned long long foo (unsigned long long x) { return x % 3; }
as
unsigned long long foo (unsigned long long x) {
  unsigned int sum, carry;
  carry = __builtin_add_overflow ((unsigned int) x, (unsigned int) (x >> 32), &sum);
  sum += carry;
  return sum % 3;
}
Similarly, 0x10000000ULL % 5 == 1 (note, this is 1 << 28), so we can optimize
unsigned long long bar (unsigned long long x) { return x % 5; }
as
unsigned long long bar (unsigned long long x) {
  unsigned int sum = x & ((1 << 28) - 1);
  sum += (x >> 28) & ((1 << 28) - 1);
  sum += (x >> 56);
  return sum % 5;
}
etc.
We can also handle signed modulo, optimizing
long long baz (long long x) { return x % 5; }
as
long long baz (long long x) {
  unsigned int sum = x & ((1 << 28) - 1);
  sum += ((unsigned long long) x >> 28) & ((1 << 28) - 1);
  sum += ((unsigned long long) x >> 56);
  /* Sum adjustment for negative x.  */
  sum += (x >> 63) & 3;
  unsigned int rem = sum % 5;
  /* And finally adjust it to the right interval for negative values.  */
  return (int) (rem + ((x >> 63) & -4));
}

2020-11-30  Jakub Jelinek  <jakub@redhat.com>

	PR rtl-optimization/97459
	* internal-fn.h (expand_addsub_overflow): Declare.
	* internal-fn.c (expand_addsub_overflow): No longer static.
	* optabs.c (expand_doubleword_mod): New function.
	(expand_binop): Optimize double-word mod with constant divisor.

	* gcc.dg/pr97459-1.c: New test.
	* gcc.dg/pr97459-2.c: New test.
---

diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 3a979f144de..9c4fd1ca35a 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -798,7 +798,7 @@ expand_ubsan_result_store (rtx target, rtx res)
 /* Add sub/add overflow checking to the statement STMT.
    CODE says whether the operation is +, or -.  */
 
-static void
+void
 expand_addsub_overflow (location_t loc, tree_code code, tree lhs,
 			tree arg0, tree arg1, bool unsr_p, bool uns0_p,
 			bool uns1_p, bool is_ubsan, tree *datap)
diff --git a/gcc/internal-fn.h b/gcc/internal-fn.h
index ac970e9e82d..2a96b2b9216 100644
--- a/gcc/internal-fn.h
+++ b/gcc/internal-fn.h
@@ -224,6 +224,8 @@ extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
 extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
 						poly_uint64, unsigned int);
 
+extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
+				    bool, bool, bool, bool, tree *);
 extern void expand_internal_call (gcall *);
 extern void expand_internal_call (internal_fn, gcall *);
 extern void expand_PHI (internal_fn, gcall *);
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 1820b91877a..3d5b07d95ce 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -44,6 +44,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "expr.h"
 #include "optabs-tree.h"
 #include "libfuncs.h"
+#include "internal-fn.h"
+#include "langhooks.h"
 
 static void prepare_float_lib_cmp (rtx, rtx, enum rtx_code, rtx *,
 				   machine_mode *);
@@ -926,6 +928,196 @@ expand_doubleword_mult (machine_mode mode, rtx op0, rtx op1, rtx target,
   emit_move_insn (product_high, adjust);
   return product;
 }
+
+/* Subroutine of expand_binop.  Optimize unsigned double-word OP0 % OP1 for
+   constant OP1.  If for some bit in [BITS_PER_WORD / 2, BITS_PER_WORD] range
+   (prefer higher bits) ((1w << bit) % OP1) == 1, then the modulo can be
+   computed in word-mode as ((OP0 & MASK) + ((OP0 >> bit) & MASK)
+   + (OP0 >> (2 * bit))) % OP1, where MASK is (1w << bit) - 1.  Whether we
+   need to sum 2, 3 or 4 values depends on the bit value; if 2, then the
+   carry from the addition needs to be added too, i.e. like:
+   sum += __builtin_add_overflow (low, high, &sum)
+
+   Optimize signed double-word OP0 % OP1 similarly, just apply some correction
+   factor to the sum before doing unsigned remainder, in the form of
+   sum += (((signed) OP0 >> (2 * BITS_PER_WORD - 1)) & const);
+   then perform unsigned remainder = sum % OP1; and finally
+   remainder += ((signed) OP0 >> (2 * BITS_PER_WORD - 1)) & (1 - OP1);
+   Return the remainder converted to MODE, or NULL_RTX if no suitable bit
+   exists, a step cannot be expanded or a library call would be needed.  */
+
+static rtx
+expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
+{
+  if (INTVAL (op1) <= 1)
+    return NULL_RTX;
+
+  rtx_insn *last = get_last_insn ();
+  for (int bit = BITS_PER_WORD; bit >= BITS_PER_WORD / 2; bit--)
+    {
+      wide_int w = wi::shifted_mask (bit, 1, false, 2 * BITS_PER_WORD);
+      if (wi::ne_p (wi::umod_trunc (w, INTVAL (op1)), 1))
+	continue;
+      rtx sum = NULL_RTX, mask = NULL_RTX;
+      if (bit == BITS_PER_WORD)
+	{
+	  /* For signed modulo we need to add correction to the sum
+	     and that might again overflow.  */
+	  if (!unsignedp)
+	    continue;
+	  if (optab_handler (uaddv4_optab, word_mode) == CODE_FOR_nothing)
+	    continue;
+	  tree wtype = lang_hooks.types.type_for_mode (word_mode, 1);
+	  if (wtype == NULL_TREE)
+	    continue;
+	  tree ctype = build_complex_type (wtype);
+	  if (TYPE_MODE (ctype) != GET_MODE_COMPLEX_MODE (word_mode))
+	    continue;
+	  machine_mode cmode = TYPE_MODE (ctype);
+	  rtx op00 = operand_subword_force (op0, 0, mode);
+	  rtx op01 = operand_subword_force (op0, 1, mode);
+	  rtx cres = gen_rtx_CONCAT (cmode, gen_reg_rtx (word_mode),
+				     gen_reg_rtx (word_mode));
+	  tree lhs = make_tree (ctype, cres);
+	  tree arg0 = make_tree (wtype, op00);
+	  tree arg1 = make_tree (wtype, op01);
+	  expand_addsub_overflow (UNKNOWN_LOCATION, PLUS_EXPR, lhs, arg0,
+				  arg1, true, true, true, false, NULL);
+	  sum = expand_simple_binop (word_mode, PLUS, XEXP (cres, 0),
+				     XEXP (cres, 1), NULL_RTX, 1,
+				     OPTAB_DIRECT);
+	  if (sum == NULL_RTX)
+	    return NULL_RTX;
+	}
+      else
+	{
+	  /* Code below uses GEN_INT, so we need the masks to be representable
+	     in HOST_WIDE_INTs.  */
+	  if (bit >= HOST_BITS_PER_WIDE_INT)
+	    continue;
+	  /* If op0 is e.g. -1 or -2 unsigned, then the 2 additions might
+	     overflow.  Consider 64-bit -1ULL for word size 32, if we add
+	     0x7fffffffU + 0x7fffffffU + 3U, it wraps around to 1.  */
+	  if (bit == BITS_PER_WORD - 1)
+	    continue;
+
+	  int count = (2 * BITS_PER_WORD + bit - 1) / bit;
+	  rtx sum_corr = NULL_RTX;
+
+	  if (!unsignedp)
+	    {
+	      /* For signed modulo, compute it as unsigned modulo of sum
+		 with a correction added to it if OP0 is negative, such
+		 that the result can be computed as unsigned remainder
+		 + (((signed) OP0 >> (2 * BITS_PER_WORD - 1)) & (1 - OP1)).  */
+	      w = wi::min_value (2 * BITS_PER_WORD, SIGNED);
+	      wide_int wmod1 = wi::umod_trunc (w, INTVAL (op1));
+	      wide_int wmod2 = wi::smod_trunc (w, INTVAL (op1));
+	      /* wmod2 == -wmod1.  */
+	      wmod2 = wmod2 + (INTVAL (op1) - 1);
+	      if (wi::ne_p (wmod1, wmod2))
+		{
+		  wide_int wcorr = wmod2 - wmod1;
+		  if (wi::neg_p (w))
+		    wcorr = wcorr + INTVAL (op1);
+		  /* Now verify if the count sums can't overflow, and punt
+		     if they could.  */
+		  w = wi::mask (bit, false, 2 * BITS_PER_WORD);
+		  w = w * (count - 1);
+		  w = w + wi::mask (2 * BITS_PER_WORD - (count - 1) * bit,
+				    false, 2 * BITS_PER_WORD);
+		  w = w + wcorr;
+		  w = wi::lrshift (w, BITS_PER_WORD);
+		  if (wi::ne_p (w, 0))
+		    continue;
+
+		  mask = operand_subword_force (op0, WORDS_BIG_ENDIAN ? 0 : 1,
+						mode);
+		  mask = expand_simple_binop (word_mode, ASHIFTRT, mask,
+					      GEN_INT (BITS_PER_WORD - 1),
+					      NULL_RTX, 0, OPTAB_DIRECT);
+		  if (mask == NULL_RTX)
+		    return NULL_RTX;
+		  sum_corr = immed_wide_int_const (wcorr, word_mode);
+		  sum_corr = expand_simple_binop (word_mode, AND, mask,
+						  sum_corr, NULL_RTX, 1,
+						  OPTAB_DIRECT);
+		  if (sum_corr == NULL_RTX)
+		    return NULL_RTX;
+		}
+	    }
+
+	  for (int i = 0; i < count; i++)
+	    {
+	      rtx v = op0;
+	      if (i)
+		v = expand_simple_binop (mode, LSHIFTRT, v, GEN_INT (i * bit),
+					 NULL_RTX, 1, OPTAB_DIRECT);
+	      if (v == NULL_RTX)
+		return NULL_RTX;
+	      v = lowpart_subreg (word_mode, v, mode);
+	      if (v == NULL_RTX)
+		return NULL_RTX;
+	      if (i != count - 1)
+		v = expand_simple_binop (word_mode, AND, v,
+					 GEN_INT ((HOST_WIDE_INT_1U << bit)
+						  - 1), NULL_RTX, 1,
+					 OPTAB_DIRECT);
+	      if (v == NULL_RTX)
+		return NULL_RTX;
+	      if (sum == NULL_RTX)
+		sum = v;
+	      else
+		sum = expand_simple_binop (word_mode, PLUS, sum, v, NULL_RTX,
+					   1, OPTAB_DIRECT);
+	      if (sum == NULL_RTX)
+		return NULL_RTX;
+	    }
+	  if (sum_corr)
+	    {
+	      sum = expand_simple_binop (word_mode, PLUS, sum, sum_corr,
+					 NULL_RTX, 1, OPTAB_DIRECT);
+	      if (sum == NULL_RTX)
+		return NULL_RTX;
+	    }
+	}
+      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum, op1,
+				     NULL_RTX, 1);
+      if (remainder == NULL_RTX)
+	return NULL_RTX;
+
+      if (!unsignedp)
+	{
+	  if (mask == NULL_RTX)
+	    {
+	      mask = operand_subword_force (op0, WORDS_BIG_ENDIAN ? 0 : 1,
+					    mode);
+	      mask = expand_simple_binop (word_mode, ASHIFTRT, mask,
+					  GEN_INT (BITS_PER_WORD - 1),
+					  NULL_RTX, 0, OPTAB_DIRECT);
+	      if (mask == NULL_RTX)
+		return NULL_RTX;
+	    }
+	  mask = expand_simple_binop (word_mode, AND, mask,
+				      GEN_INT (1 - INTVAL (op1)),
+				      NULL_RTX, 1, OPTAB_DIRECT);
+	  if (mask == NULL_RTX)
+	    return NULL_RTX;
+	  remainder = expand_simple_binop (word_mode, PLUS, remainder,
+					   mask, NULL_RTX, 1, OPTAB_DIRECT);
+	  if (remainder == NULL_RTX)
+	    return NULL_RTX;
+	}
+
+      remainder = convert_modes (mode, word_mode, remainder, unsignedp);
+      /* Punt if the sequence emitted since LAST needed any library calls.  */
+      for (; last; last = NEXT_INSN (last))
+	if (CALL_P (last))
+	  return NULL_RTX;
+      return remainder;
+    }
+  return NULL_RTX;
+}
 
 /* Wrapper around expand_binop which takes an rtx code to specify
    the operation to perform, not an optab pointer.  All other
@@ -1806,6 +1998,37 @@ expand_binop (machine_mode mode, optab binoptab, rtx op0, rtx op1,
 	}
     }
 
+  /* Attempt to synthesize double-word modulo by a constant divisor.  */
+  if ((binoptab == umod_optab || binoptab == smod_optab)
+      && optimize
+      && CONST_INT_P (op1)
+      && is_int_mode (mode, &int_mode)
+      && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
+      && optab_handler (lshr_optab, int_mode) != CODE_FOR_nothing
+      && optab_handler (and_optab, word_mode) != CODE_FOR_nothing
+      && optab_handler (add_optab, word_mode) != CODE_FOR_nothing
+      && optimize_insn_for_speed_p ())
+    {
+      rtx remainder = expand_doubleword_mod (int_mode, op0, op1,
+					     binoptab == umod_optab);
+      if (remainder != NULL_RTX)
+	{
+	  if (optab_handler (mov_optab, int_mode) != CODE_FOR_nothing)
+	    {
+	      rtx_insn *move = emit_move_insn (target ? target : remainder,
+					       remainder);
+	      /* The note's code must match the signedness of the modulo.  */
+	      set_dst_reg_note (move, REG_EQUAL,
+				gen_rtx_fmt_ee (optab_to_code (binoptab),
+						int_mode, copy_rtx (op0), op1),
+				target ? target : remainder);
+	    }
+	  return remainder;
+	}
+      else
+	delete_insns_since (last);
+    }
+
   /* It can't be open-coded in this mode.
      Use a library call if one is available and caller says that's ok.  */
 
diff --git a/gcc/testsuite/gcc.dg/pr97459-1.c b/gcc/testsuite/gcc.dg/pr97459-1.c
new file mode 100644
index 00000000000..3dcbb1dbea2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr97459-1.c
@@ -0,0 +1,54 @@
+/* PR rtl-optimization/97459 */
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
+
+#ifdef __SIZEOF_INT128__
+typedef __uint128_t T;
+#else
+typedef unsigned long long T;
+#endif
+/* Reference: modulo by a divisor that is passed at run time.  */
+T __attribute__((noipa)) foo (T x, T n) { return x % n; }
+#define C(n) T __attribute__((noipa)) foo##n (T x) { return x % (n - 10000); }
+/* C1 to C4 each append one digit to N; the final digit is always odd.  */
+#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
+#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
+	      C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
+#ifdef EXPENSIVE
+#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
+	      C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
+#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
+	      C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
+#else
+#define C3(n) C2(n##0) C2(n##4) C2(n##9)
+#define C4(n) C3(n##0) C3(n##3) C3(n##7)
+#endif
+#define TESTS C4(1)
+/* Emit one fooN per generated N, each computing x % (N - 10000).  */
+TESTS
+/* X holds a constant divisor, FOO the function using it as a constant.  */
+struct S { T x; T (*foo) (T); };
+
+#undef C
+#define C(n) { n - 10000, foo##n },
+/* One { divisor, function } entry per N, ended by a { 0, 0 } sentinel.  */
+struct S tests[] = {
+TESTS
+  { 0, 0 }
+};
+/* Compare both variants on x = 2^i + j for each bit i of T and |j| <= 5.  */
+int
+main ()
+{
+  int i, j, k;
+  for (k = 0; tests[k].x; k++)
+    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
+      for (j = -5; j <= 5; j++)
+	{
+	  T x = ((T) 1 << i) + j;
+	  if (foo (x, tests[k].x) != tests[k].foo (x))
+	    __builtin_abort ();
+	}
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/pr97459-2.c b/gcc/testsuite/gcc.dg/pr97459-2.c
new file mode 100644
index 00000000000..83e00cb1f55
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr97459-2.c
@@ -0,0 +1,57 @@
+/* PR rtl-optimization/97459 */
+/* { dg-do run } */
+/* { dg-options "-O2" } */
+/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
+
+#ifdef __SIZEOF_INT128__
+typedef __int128_t T;
+typedef __uint128_t U;
+#else
+typedef long long T;
+typedef unsigned long long U;
+#endif
+/* Reference: signed modulo by a divisor that is passed at run time.  */
+T __attribute__((noipa)) foo (T x, T n) { return x % n; }
+#define C(n) T __attribute__((noipa)) foo##n (T x) { return x % (n - 10000); }
+/* C1 to C4 each append one digit to N; the final digit is always odd.  */
+#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
+#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
+	      C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
+#ifdef EXPENSIVE
+#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
+	      C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
+#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
+	      C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
+#else
+#define C3(n) C2(n##0) C2(n##4) C2(n##9)
+#define C4(n) C3(n##0) C3(n##3) C3(n##7)
+#endif
+#define TESTS C4(1)
+/* Emit one fooN per generated N, each computing x % (N - 10000).  */
+TESTS
+/* X holds a constant divisor, FOO the function using it as a constant.  */
+struct S { T x; T (*foo) (T); };
+
+#undef C
+#define C(n) { n - 10000, foo##n },
+/* One { divisor, function } entry per N, ended by a { 0, 0 } sentinel.  */
+struct S tests[] = {
+TESTS
+  { 0, 0 }
+};
+/* Compare both variants on x = 2^i + j and on -x, each converted to T.  */
+int
+main ()
+{
+  int i, j, k;
+  for (k = 0; tests[k].x; k++)
+    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
+      for (j = -5; j <= 5; j++)
+	{
+	  U x = ((U) 1 << i) + j;
+	  if (foo ((T) x, tests[k].x) != tests[k].foo ((T) x)
+	      || foo ((T) -x, tests[k].x) != tests[k].foo ((T) -x))
+	    __builtin_abort ();
+	}
+  return 0;
+}