From d3da63e543ba6e01af109f0b9827cdc03ddf116e Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Fri, 13 Jan 2017 16:00:26 +0000
Subject: [PATCH] Avoid excessively-big hash tables in empty-add cycles

A big source of cache misses when compiling a recent version of
gimple-match.ii was the call to cv_cache.empty () in clear_cv_cache.
The problem was that at one early point the hash table had grown
to 8191 entries (128k on LP64 hosts).  It then stayed at that size
for the rest of the compilation, even though subsequent uses needed
only a small number of entries (usually fewer than ten).  We would
still clear the whole 128k each time clear_cv_cache was called.

empty() already looks for cases where the hash table is very big
and cuts it down.  At the moment it fires when the table is 1M
in size and reduces it to the next selected prime above 1K (so
almost 2K in practice).  One fix would have been to lower the
threshold, but that didn't feel like the right approach.  Reducing
the current limit of 1M by a factor of 8 would be pretty significant
on its own, but I think this cv_cache behaviour would have been a
problem even with 64k or 32k tables.

I think the existing check is really for cases in which even a
well-populated table would need to be shrunk rather than cleared.
Here the problem isn't that the table is excessively big in
absolute terms, more that one outlier has made the table much
too big for the general case.

traverse() already shrinks the table if it's "too empty",
which is taken to be if:

      no. elements * 8 < capacity && capacity > 32

So an alternative would be to apply the same test (and the same choice
of shrunken size) to empty_slow too.  The patch below does this.
It gives a 2.5% improvement in gimple-match.ii compile time at -O0 -g
and doesn't seem to adversely affect any other tests I've tried.

Of course, there's a theoretical risk of a table alternating between
one large element count and one small element count.  If there was a
factor of eight difference between the two, we could shrink the table
on seeing each small element count, then grow it again when adding the
large number of elements.  That seems pretty unlikely in practice
though.

Also, empty_slow() does involve a traversal if some form of manual
gc is needed on active elements, so trying to recover from an outlier
should have even more benefit there.  (cv_cache uses automatic gc and so
the traversal gets optimised away.)

The calculation of the existing 1M threshold was assuming that each
entry was pointer-sized.  This patch makes it use the actual size of the
entry instead.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.

gcc/
	* hash-table.h (hash_table::too_empty_p): New function.
	(hash_table::expand): Use it.
	(hash_table::traverse): Likewise.
	(hash_table::empty_slot): Use sizeof (value_type) instead of
	sizeof (PTR) to convert bytes to elements.  Shrink the table
	if the current size is excessive for the current number of
	elements.

From-SVN: r244447
---
 gcc/ChangeLog    | 10 ++++++++++
 gcc/hash-table.h | 25 ++++++++++++++++++++-----
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 95d1d0ac008..b5f7b08f0a8 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,13 @@
+2017-01-13  Richard Sandiford  <richard.sandiford@arm.com>
+
+	* hash-table.h (hash_table::too_empty_p): New function.
+	(hash_table::expand): Use it.
+	(hash_table::traverse): Likewise.
+	(hash_table::empty_slot): Use sizeof (value_type) instead of
+	sizeof (PTR) to convert bytes to elements.  Shrink the table
+	if the current size is excessive for the current number of
+	elements.
+
 2017-01-13  Richard Sandiford  <richard.sandiford@arm.com>
 
 	* ira-costs.c (record_reg_classes): Break from the inner loop
diff --git a/gcc/hash-table.h b/gcc/hash-table.h
index 0bee093a645..0f7e21a2cc5 100644
--- a/gcc/hash-table.h
+++ b/gcc/hash-table.h
@@ -503,6 +503,7 @@ private:
 
   value_type *alloc_entries (size_t n CXX_MEM_STAT_INFO) const;
   value_type *find_empty_slot_for_expand (hashval_t);
+  bool too_empty_p (unsigned int);
   void expand ();
   static bool is_deleted (value_type &v)
   {
@@ -691,6 +692,15 @@ hash_table<Descriptor, Allocator>::find_empty_slot_for_expand (hashval_t hash)
     }
 }
 
+/* Return true if the current table is excessively big for ELTS elements.  */
+
+template<typename Descriptor, template<typename Type> class Allocator>
+inline bool
+hash_table<Descriptor, Allocator>::too_empty_p (unsigned int elts)
+{
+  return elts * 8 < m_size && m_size > 32;
+}
+
 /* The following function changes size of memory allocated for the
    entries and repeatedly inserts the table elements.  The occupancy
    of the table after the call will be about 50%.  Naturally the hash
@@ -712,7 +722,7 @@ hash_table<Descriptor, Allocator>::expand ()
      too full or too empty.  */
   unsigned int nindex;
   size_t nsize;
-  if (elts * 2 > osize || (elts * 8 < osize && osize > 32))
+  if (elts * 2 > osize || too_empty_p (elts))
     {
       nindex = hash_table_higher_prime_index (elts * 2);
       nsize = prime_tab[nindex].prime;
@@ -764,6 +774,7 @@ void
 hash_table<Descriptor, Allocator>::empty_slow ()
 {
   size_t size = m_size;
+  size_t nsize = size;
   value_type *entries = m_entries;
   int i;
 
@@ -772,9 +783,14 @@ hash_table<Descriptor, Allocator>::empty_slow ()
       Descriptor::remove (entries[i]);
 
   /* Instead of clearing megabyte, downsize the table.  */
-  if (size > 1024*1024 / sizeof (PTR))
+  if (size > 1024*1024 / sizeof (value_type))
+    nsize = 1024 / sizeof (value_type);
+  else if (too_empty_p (m_n_elements))
+    nsize = m_n_elements * 2;
+
+  if (nsize != size)
     {
-      int nindex = hash_table_higher_prime_index (1024 / sizeof (PTR));
+      int nindex = hash_table_higher_prime_index (nsize);
       int nsize = prime_tab[nindex].prime;
 
       if (!m_ggc)
@@ -965,8 +981,7 @@ template <typename Argument,
 void
 hash_table<Descriptor, Allocator>::traverse (Argument argument)
 {
-  size_t size = m_size;
-  if (elements () * 8 < size && size > 32)
+  if (too_empty_p (elements ()))
     expand ();
 
   traverse_noresize <Argument, Callback> (argument);
-- 
2.30.2