re PR libstdc++/35588 ([parallel mode] parallel std::sort and bind())
author Johannes Singler <singler@ira.uka.de>
Mon, 7 Apr 2008 08:27:34 +0000 (08:27 +0000)
committer Johannes Singler <singler@gcc.gnu.org>
Mon, 7 Apr 2008 08:27:34 +0000 (08:27 +0000)
2008-04-07  Johannes Singler  <singler@ira.uka.de>

         * include/parallel/multiway_merge.h:
           Moved decisions to compiletime instead of runtime.
         * include/parallel/losertree.h:
           Removed obsolete variants, added variant that uses pointers
           in the loser tree.
         * include/parallel/types.h:
           Remove obsolete settings options from enum.
         * include/parallel/features.h:
           Remove obsolete compile-time switches.
         * include/parallel/compiletime_settings.h:
           Remove obsolete variant that copies back *after* sorting.
         * include/parallel/tags.h:
           Add one new tag for compile-time switch.
         * include/parallel/merge.h:
           Adapt to changes in multiway_merge.h.
         * include/parallel/multiway_mergesort.h:
           Adapt to changes in multiway_merge.h.
           Factor out splitting variants.
           Remove obsolete variant that copies back *after* sorting.
         * include/parallel/sort.h:
           Adapt to changes in multiway_mergesort.h.
         * testsuite/25_algorithms/sort/35588.cc:
           Added test case from / for PR 35588.

From-SVN: r133975

libstdc++-v3/ChangeLog
libstdc++-v3/include/parallel/compiletime_settings.h
libstdc++-v3/include/parallel/features.h
libstdc++-v3/include/parallel/losertree.h
libstdc++-v3/include/parallel/merge.h
libstdc++-v3/include/parallel/multiway_merge.h
libstdc++-v3/include/parallel/multiway_mergesort.h
libstdc++-v3/include/parallel/sort.h
libstdc++-v3/include/parallel/tags.h
libstdc++-v3/include/parallel/types.h
libstdc++-v3/testsuite/25_algorithms/sort/35588.cc [new file with mode: 0644]

index f59007405603ba1e4756be98d2f26b6f9050b7c0..e452aaab6a82c319855fd28433234f48b4e79ef5 100644 (file)
@@ -1,3 +1,29 @@
+2008-04-07  Johannes Singler  <singler@ira.uka.de>
+
+         * include/parallel/multiway_merge.h:
+           Moved decisions to compiletime instead of runtime.
+         * include/parallel/losertree.h:
+           Removed obsolete variants, added variant that uses pointers
+           in the loser tree.
+         * include/parallel/types.h:
+           Remove obsolete settings options from enum.
+         * include/parallel/features.h:
+           Remove obsolete compile-time switches.
+         * include/parallel/compiletime_settings.h:
+           Remove obsolete variant that copies back *after* sorting.
+         * include/parallel/tags.h:
+           Add one new tag for compile-time switch.
+         * include/parallel/merge.h:
+           Adapt to changes in multiway_merge.h.
+         * include/parallel/multiway_mergesort.h:
+           Adapt to changes in multiway_merge.h.
+           Factor out splitting variants.
+           Remove obsolete variant that copies back *after* sorting.
+         * include/parallel/sort.h:
+           Adapt to changes in multiway_mergesort.h.
+         * testsuite/25_algorithms/sort/35588.cc:
+           Added test case from / for PR 35588. 
+
 2008-03-29  Paolo Carlini  <pcarlini@suse.de>
 
        PR libstdc++/35725
index edaea3856ad4f5dec7614eb4dd7057f1dad9e22d..8ab89aa8ee94edd6dd4a654ca06eb139e652dafa 100644 (file)
  *  __gnu_parallel::parallel_random_shuffle(). */
 #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 0
 #endif
-#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 
+#ifndef _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
 /** @brief Consider the size of the TLB for
  *  __gnu_parallel::parallel_random_shuffle(). */
 #define _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB 0
 #endif
-
-#ifndef _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
-/** @brief First copy the data, sort it locally, and merge it back
- * (0); or copy it back after everything is done (1).
- *
- *  Recommendation: 0 */
-#define _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST 0
-#endif
index 2e09980405ed9a6873cb663b0939c348c53137bb..7150c20affcd9c620f94ec77fda36d00fac4aec4 100644 (file)
 #define _GLIBCXX_BAL_QUICKSORT 1
 #endif
 
-#ifndef _GLIBCXX_LOSER_TREE
-/** @def _GLIBCXX_LOSER_TREE
- *  @brief Include guarded (sequences may run empty) loser tree,
- *  moving objects.
- *  @see __gnu_parallel::_Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE 1
-#endif
-
-#ifndef _GLIBCXX_LOSER_TREE_EXPLICIT
-/** @def _GLIBCXX_LOSER_TREE_EXPLICIT
- *  @brief Include standard loser tree, storing two flags for infimum
- *  and supremum.
- *  @see __gnu_parallel::_Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE_EXPLICIT 0
-#endif
-
-#ifndef _GLIBCXX_LOSER_TREE_REFERENCE
-/** @def _GLIBCXX_LOSER_TREE_REFERENCE
- *  @brief Include some loser tree variant.
- *  @see __gnu_parallel::_Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE_REFERENCE 0
-#endif
-
-#ifndef _GLIBCXX_LOSER_TREE_POINTER
-/** @def _GLIBCXX_LOSER_TREE_POINTER
- *  @brief Include some loser tree variant.
- *  @see __gnu_parallel::_Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE_POINTER 1
-#endif
-
-#ifndef _GLIBCXX_LOSER_TREE_UNGUARDED
-/** @def _GLIBCXX_LOSER_TREE_UNGUARDED
- *  @brief Include unguarded (sequences must not run empty) loser
- *  tree, moving objects.
- *  @see __gnu_parallel::_Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE_UNGUARDED 0
-#endif
-
-#ifndef _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
-/** @def _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
- *  @brief Include some loser tree variant.
- *  @see __gnu_parallel::_Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED 1
-#endif
-
-#ifndef _GLIBCXX_LOSER_TREE_COMBINED
-/** @def _GLIBCXX_LOSER_TREE_COMBINED
- *  @brief Include some loser tree variant.
- *  @see __gnu_parallel::_Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE_COMBINED 0
-#endif
-
-#ifndef _GLIBCXX_LOSER_TREE_SENTINEL
-/** @def _GLIBCXX_LOSER_TREE_SENTINEL
- *  @brief Include some loser tree variant.
- *  @see __gnu_parallel::_Settings multiway_merge_algorithm */
-#define _GLIBCXX_LOSER_TREE_SENTINEL 0
-#endif
-
-
 #ifndef _GLIBCXX_FIND_GROWING_BLOCKS
 /** @brief Include the growing blocks variant for std::find.
  *  @see __gnu_parallel::_Settings::find_algorithm */
index ddeb0d36d6ccfcb3ee4c87895ab3263608baa22f..cae15c0826eca6d6b33f4c4624aaf3a73a336bda 100644 (file)
 namespace __gnu_parallel
 {
 
-#if _GLIBCXX_LOSER_TREE_EXPLICIT
-
-/** @brief Guarded loser tree, copying the whole element into the
-* tree structure.
-*
-*  Guarding is done explicitly through two flags per element, inf
-*  and sup This is a quite slow variant.
-*/
-template<typename T, typename Comparator = std::less<T> >
-  class LoserTreeExplicit
+/**
+ * @brief Guarded loser/tournament tree.
+ *
+ * The smallest element is at the top.
+ *
+ * Guarding is done explicitly through one flag, sup, per element;
+ * inf is not needed due to a better initialization routine.  This
+ * is a well-performing variant.
+ *
+ * @param T the element type
+ * @param Comparator the comparator to use
+ */
+template<typename T, typename Comparator>
+class LoserTreeBase
+{
+protected:
+  /** @brief Internal representation of a LoserTree element. */
+  struct Loser
   {
-  private:
-    struct Loser
-    {
-      // The relevant element.
-      T key;
-
-      // Is this an infimum or supremum element?
-      bool inf, sup;
-
-      // Number of the sequence the element comes from.
-      int source;
-    };
-
-    unsigned int size, offset;
-    Loser* losers;
-    Comparator comp;
-
-  public:
-    LoserTreeExplicit(unsigned int _size, Comparator _comp = std::less<T>())
-    : comp(_comp)
-    {
-      size = _size;
-      offset = size;
-      losers = new Loser[size];
-      for (unsigned int l = 0; l < size; ++l)
-        {
-          //losers[l].key = ...        stays unset
-          losers[l].inf = true;
-          losers[l].sup = false;
-          //losers[l].source = -1;     //sentinel
-        }
-    }
-
-    ~LoserTreeExplicit()
-    { delete[] losers; }
+    /** @brief flag, true iff this is a "maximum" sentinel. */
+    bool sup;
+    /** @brief index of the source sequence. */
+    int source;
+    /** @brief key of the element in the LoserTree. */
+    T key;
+  };
 
-    int
-    get_min_source()
-    { return losers[0].source; }
+  unsigned int ik, k, offset;
+
+  /** log_2{k} */
+  unsigned int _M_log_k;
+
+  /** @brief LoserTree elements. */
+  Loser* losers;
+
+  /** @brief Comparator to use. */
+  Comparator comp;
+
+  /**
+   * @brief State flag that determines whether the LoserTree is empty.
+   *
+   * Only used for building the LoserTree.
+   */
+  bool first_insert;
+
+public:
+  /**
+   * @brief The constructor.
+   *
+   * @param _k The number of sequences to merge.
+   * @param _comp The comparator to use.
+   */
+  LoserTreeBase(unsigned int _k, Comparator _comp)
+  : comp(_comp)
+  {
+    ik = _k;
+
+    // Compute log_2{k} for the Loser Tree
+    _M_log_k = log2(ik - 1) + 1;
+
+    // Next greater power of 2.
+    k = 1 << _M_log_k;
+    offset = k;
+
+    // Avoid default-constructing losers[].key
+    losers = static_cast<Loser*>(::operator new(2 * k * sizeof(Loser)));
+    for (unsigned int i = ik - 1; i < k; ++i)
+      losers[i + k].sup = true;
+
+    first_insert = true;
+  }
+
+  /**
+   * @brief The destructor.
+   */
+  ~LoserTreeBase()
+  { ::operator delete(losers); }
+
+  /**
+   * @brief Initializes the sequence "source" with the element "key".
+   *
+   * @param key the element to insert
+   * @param source index of the source sequence
+   * @param sup flag that determines whether the value to insert is an
+   *   explicit supremum.
+   */
+  inline void
+  insert_start(const T& key, int source, bool sup)
+  {
+    unsigned int pos = k + source;
+
+    if(first_insert)
+      {
+        // Construct all keys, so we can easily destroy them.
+        for (unsigned int i = 0; i < (2 * k); ++i)
+          new(&(losers[i].key)) T(key);
+        first_insert = false;
+      }
+    else
+      new(&(losers[pos].key)) T(key);
+
+    losers[pos].sup = sup;
+    losers[pos].source = source;
+  }
+
+  /**
+   * @return the index of the sequence with the smallest element.
+   */
+  int get_min_source()
+  { return losers[0].source; }
+};
+
+/**
+ * @brief Stable LoserTree variant.
+ *
+ * Provides the stable implementations of init_winner, init and
+ * delete_min_insert; insert_start is inherited from LoserTreeBase.
+ *
+ * The unstable variant is provided by the partial specialisation below.
+ */
+template<bool stable/* default == true */, typename T, typename Comparator>
+class LoserTree : public LoserTreeBase<T, Comparator>
+{
+  typedef LoserTreeBase<T, Comparator> Base;
+  using Base::k;
+  using Base::losers;
+  using Base::first_insert;
+
+public:
+  LoserTree(unsigned int _k, Comparator _comp)
+  : Base::LoserTreeBase(_k, _comp)
+  {}
+
+  unsigned int
+  init_winner(unsigned int root)
+  {
+    if (root >= k)
+      {
+        return root;
+      }
+    else
+      {
+        unsigned int left = init_winner (2 * root);
+        unsigned int right = init_winner (2 * root + 1);
+        if (losers[right].sup
+            || (!losers[left].sup
+              && !comp(losers[right].key, losers[left].key)))
+          {
+            // Left one is less or equal.
+            losers[root] = losers[right];
+            return left;
+          }
+        else
+          {
+            // Right one is less.
+            losers[root] = losers[left];
+            return right;
+          }
+      }
+  }
+
+  void init()
+  { losers[0] = losers[init_winner(1)]; }
+
+  /**
+   * @brief Delete the smallest element and insert a new element from
+   *   the previously smallest element's sequence.
+   *
+   * This implementation is stable.
+   */
+  // Do not pass a const reference since key will be used as local variable.
+  void delete_min_insert(T key, bool sup)
+  {
+    int source = losers[0].source;
+    for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
+      {
+        // The smaller one gets promoted, ties are broken by source.
+        if ((sup && (!losers[pos].sup || losers[pos].source < source))
+              || (!sup && !losers[pos].sup
+                && ((comp(losers[pos].key, key))
+                  || (!comp(key, losers[pos].key)
+                    && losers[pos].source < source))))
+          {
+            // The other one is smaller.
+            std::swap(losers[pos].sup, sup);
+            std::swap(losers[pos].source, source);
+            std::swap(losers[pos].key, key);
+          }
+      }
+
+    losers[0].sup = sup;
+    losers[0].source = source;
+    losers[0].key = key;
+  }
+};
+
+/**
+ * @brief Unstable LoserTree variant.
+ *
+ * Stability (non-stable here) is selected with partial specialization.
+ */
+template<typename T, typename Comparator>
+class LoserTree</* stable == */false, T, Comparator> :
+    public LoserTreeBase<T, Comparator>
+{
+  typedef LoserTreeBase<T, Comparator> Base;
+  using Base::_M_log_k;
+  using Base::k;
+  using Base::losers;
+  using Base::first_insert;
+
+public:
+  LoserTree(unsigned int _k, Comparator _comp)
+  : Base::LoserTreeBase(_k, _comp)
+  {}
+
+  /**
+   * Computes the winner of the competition at position "root".
+   *
+   * Called recursively (starting at 1) to build the initial tree.
+   *
+   * @param root index of the "game" to start.
+   */
+  unsigned int
+  init_winner (unsigned int root)
+  {
+    if (root >= k)
+      {
+        return root;
+      }
+    else
+      {
+        unsigned int left = init_winner (2 * root);
+        unsigned int right = init_winner (2 * root + 1);
+        if (losers[right].sup ||
+            (!losers[left].sup
+              && !comp(losers[right].key, losers[left].key)))
+          {
+            // Left one is less or equal.
+            losers[root] = losers[right];
+            return left;
+          }
+        else
+          {
+            // Right one is less.
+            losers[root] = losers[left];
+            return right;
+          }
+      }
+  }
+
+  inline void
+  init()
+  { losers[0] = losers[init_winner(1)]; }
+
+  /**
+   * Delete the smallest element and insert the element key instead.
+   *
+   * @param key the key to insert
+   * @param sup true iff key is an explicitly marked supremum
+   */
+  // Do not pass a const reference since key will be used as local variable.
+  inline void
+  delete_min_insert(T key, bool sup)
+  {
+#if _GLIBCXX_ASSERTIONS
+    // loser trees are only used for at least 2 sequences
+    _GLIBCXX_PARALLEL_ASSERT(_M_log_k > 1);
+#endif
 
-    void
-    insert_start(T key, int source, bool sup)
+    int source = losers[0].source;
+    for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
     {
-      bool inf = false;
-      for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
-        {
-          if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
-               && comp(losers[pos].key, key)) || losers[pos].inf || sup)
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].key, key);
-              std::swap(losers[pos].inf, inf);
-              std::swap(losers[pos].sup, sup);
-              std::swap(losers[pos].source, source);
-            }
-        }
-
-      losers[0].key = key;
-      losers[0].inf = inf;
-      losers[0].sup = sup;
-      losers[0].source = source;
+        // The smaller one gets promoted.
+      if (sup || (!losers[pos].sup && comp(losers[pos].key, key)))
+      {
+            // The other one is smaller.
+        std::swap(losers[pos].sup, sup);
+        std::swap(losers[pos].source, source);
+        std::swap(losers[pos].key, key);
+      }
     }
 
-    void
-    init() { }
+    losers[0].sup = sup;
+    losers[0].source = source;
+    losers[0].key = key;
+  }
+};
 
-    void
-    delete_min_insert(T key, bool sup)
-    {
-      bool inf = false;
-      int source = losers[0].source;
-      for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
-        {
-          // The smaller one gets promoted.
-          if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
-              && comp(losers[pos].key, key))
-              || losers[pos].inf || sup)
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].key, key);
-              std::swap(losers[pos].inf, inf);
-              std::swap(losers[pos].sup, sup);
-              std::swap(losers[pos].source, source);
-            }
-        }
-
-      losers[0].key = key;
-      losers[0].inf = inf;
-      losers[0].sup = sup;
-      losers[0].source = source;
-    }
-
-    void
-    insert_start_stable(T key, int source, bool sup)
-    {
-      bool inf = false;
-      for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
-        {
-          if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
-              && ((comp(losers[pos].key, key))
-                  || (!comp(key, losers[pos].key)
-                      && losers[pos].source < source)))
-              || losers[pos].inf || sup)
-            {
-              // Take next key.
-              std::swap(losers[pos].key, key);
-              std::swap(losers[pos].inf, inf);
-              std::swap(losers[pos].sup, sup);
-              std::swap(losers[pos].source, source);
-            }
-        }
-
-      losers[0].key = key;
-      losers[0].inf = inf;
-      losers[0].sup = sup;
-      losers[0].source = source;
-    }
 
-    void
-    init_stable() { }
-
-    void
-    delete_min_insert_stable(T key, bool sup)
-    {
-      bool inf = false;
-      int source = losers[0].source;
-      for (unsigned int pos = (offset + source) / 2; pos > 0; pos /= 2)
-        {
-          if ((!inf && !losers[pos].inf && !sup && !losers[pos].sup
-              && ((comp(losers[pos].key, key))
-                 || (!comp(key, losers[pos].key)
-                     && losers[pos].source < source)))
-              || losers[pos].inf || sup)
-            {
-              std::swap(losers[pos].key, key);
-              std::swap(losers[pos].inf, inf);
-              std::swap(losers[pos].sup, sup);
-              std::swap(losers[pos].source, source);
-            }
-        }
-
-      losers[0].key = key;
-      losers[0].inf = inf;
-      losers[0].sup = sup;
-      losers[0].source = source;
-    }
+/**
+ * @brief Base class of Loser Tree implementation using pointers.
+ */
+template<typename T, typename Comparator>
+class LoserTreePointerBase
+{
+protected:
+  /** @brief Internal representation of LoserTree elements. */
+  struct Loser
+  {
+    bool sup;
+    int source;
+    const T* keyp;
   };
 
-#endif
-
-#if _GLIBCXX_LOSER_TREE
+  unsigned int ik, k, offset;
+  Loser* losers;
+  Comparator comp;
 
-/** @brief Guarded loser tree, either copying the whole element into
-* the tree structure, or looking up the element via the index.
-*
-*  Guarding is done explicitly through one flag sup per element,
-*  inf is not needed due to a better initialization routine.  This
-*  is a well-performing variant.
-*/
-template<typename T, typename Comparator = std::less<T> >
-  class LoserTree
-  {
-  private:
-    struct Loser
-    {
-      bool sup;
-      int source;
-      T key;
-    };
-
-    unsigned int ik, k, offset;
-    Loser* losers;
-    Comparator comp;
-    bool first_insert;
-
-  public:
-    LoserTree(unsigned int _k, Comparator _comp = std::less<T>())
+public:
+  LoserTreePointerBase(unsigned int _k, Comparator _comp = std::less<T>())
     : comp(_comp)
-    {
-      ik = _k;
-
-      // Next greater power of 2.
-      k = 1 << (log2(ik - 1) + 1);
-      offset = k;
-      // Avoid default-constructing losers[].key
-      losers = static_cast<Loser*>(::operator new(2 * k * sizeof(Loser)));
-      for (unsigned int i = ik - 1; i < k; ++i)
-        losers[i + k].sup = true;
-
-      first_insert = true;
-    }
+  {
+    ik = _k;
 
-    ~LoserTree()
-    { ::operator delete(losers); }
+    // Next greater power of 2.
+    k = 1 << (log2(ik - 1) + 1);
+    offset = k;
+    losers = new Loser[k * 2];
+    for (unsigned int i = ik - 1; i < k; i++)
+      losers[i + k].sup = true;
+  }
 
-    int
-    get_min_source()
-    { return losers[0].source; }
+  ~LoserTreePointerBase()
+  { ::operator delete(losers); }
 
-    void
-    insert_start(const T& key, int source, bool sup)
-    {
-      unsigned int pos = k + source;
-
-      if(first_insert)
-        {
-          // Construct all keys, so we can easily deconstruct them.
-          for (unsigned int i = 0; i < (2 * k); ++i)
-            ::new(&(losers[i].key)) T(key);
-          first_insert = false;
-        }
-      else
-        ::new(&(losers[pos].key)) T(key);
-
-      losers[pos].sup = sup;
-      losers[pos].source = source;
-    }
+  int get_min_source()
+  { return losers[0].source; }
 
-    unsigned int
-    init_winner (unsigned int root)
-    {
-      if (root >= k)
-        {
-          return root;
-        }
-      else
-        {
-          unsigned int left = init_winner (2 * root);
-          unsigned int right = init_winner (2 * root + 1);
-          if (losers[right].sup
-             || (!losers[left].sup
-                 && !comp(losers[right].key, losers[left].key)))
-            {
-              // Left one is less or equal.
-              losers[root] = losers[right];
-              return left;
-            }
-          else
-            {
-              // Right one is less.
-              losers[root] = losers[left];
-              return right;
-            }
-        }
-    }
+  void insert_start(const T& key, int source, bool sup)
+  {
+    unsigned int pos = k + source;
+
+    losers[pos].sup = sup;
+    losers[pos].source = source;
+    losers[pos].keyp = &key;
+  }
+};
+
+/**
+ * @brief Stable LoserTree implementation.
+ *
+ * The unstable variant is implemented using partial specialization below.
+ */
+template<bool stable/* default == true */, typename T, typename Comparator>
+class LoserTreePointer : public LoserTreePointerBase<T, Comparator>
+{
+  typedef LoserTreePointerBase<T, Comparator> Base;
+  using Base::k;
+  using Base::losers;
 
-    void
-    init()
-    { losers[0] = losers[init_winner(1)]; }
+public:
+  LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>())
+    : Base::LoserTreePointerBase(_k, _comp)
+  {}
 
-    // Do not pass const reference since key will be used as local variable.
-    void
-    delete_min_insert(T key, bool sup)
-    {
-      int source = losers[0].source;
-      for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-        {
-          // The smaller one gets promoted.
-          if (sup || (!losers[pos].sup && comp(losers[pos].key, key)))
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].sup, sup);
-              std::swap(losers[pos].source, source);
-              std::swap(losers[pos].key, key);
-            }
-        }
-
-      losers[0].sup = sup;
-      losers[0].source = source;
-      losers[0].key = key;
-    }
+  unsigned int
+  init_winner(unsigned int root)
+  {
+    if (root >= k)
+      {
+        return root;
+      }
+    else
+      {
+        unsigned int left = init_winner (2 * root);
+        unsigned int right = init_winner (2 * root + 1);
+        if (losers[right].sup
+            || (!losers[left].sup && !comp(*losers[right].keyp,
+                                          *losers[left].keyp)))
+          {
+            // Left one is less or equal.
+            losers[root] = losers[right];
+            return left;
+          }
+        else
+          {
+            // Right one is less.
+            losers[root] = losers[left];
+            return right;
+          }
+      }
+  }
+
+  void init()
+  { losers[0] = losers[init_winner(1)]; }
+
+  void delete_min_insert(const T& key, bool sup)
+  {
+    const T* keyp = &key;
+    int source = losers[0].source;
+    for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
+      {
+        // The smaller one gets promoted, ties are broken by source.
+        if ((sup && (!losers[pos].sup || losers[pos].source < source)) ||
+              (!sup && !losers[pos].sup &&
+              ((comp(*losers[pos].keyp, *keyp)) ||
+                (!comp(*keyp, *losers[pos].keyp)
+                && losers[pos].source < source))))
+          {
+            // The other one is smaller.
+            std::swap(losers[pos].sup, sup);
+            std::swap(losers[pos].source, source);
+            std::swap(losers[pos].keyp, keyp);
+          }
+      }
+
+    losers[0].sup = sup;
+    losers[0].source = source;
+    losers[0].keyp = keyp;
+  }
+};
+
+/**
+ * @brief Unstable LoserTree implementation.
+ *
+ * The stable variant is above.
+ */
+template<typename T, typename Comparator>
+class LoserTreePointer</* stable == */false, T, Comparator> :
+    public LoserTreePointerBase<T, Comparator>
+{
+  typedef LoserTreePointerBase<T, Comparator> Base;
+  using Base::k;
+  using Base::losers;
 
-    void
-    insert_start_stable(const T& key, int source, bool sup)
-    { return insert_start(key, source, sup); }
+public:
+  LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>())
+    : Base::LoserTreePointerBase(_k, _comp)
+  {}
 
-    unsigned int
-    init_winner_stable (unsigned int root)
-    {
-      if (root >= k)
-        {
-          return root;
-        }
-      else
-        {
-          unsigned int left = init_winner (2 * root);
-          unsigned int right = init_winner (2 * root + 1);
-          if (losers[right].sup
+  unsigned int
+  init_winner(unsigned int root)
+  {
+    if (root >= k)
+      {
+        return root;
+      }
+    else
+      {
+        unsigned int left = init_winner (2 * root);
+        unsigned int right = init_winner (2 * root + 1);
+        if (losers[right].sup
               || (!losers[left].sup
-                 && !comp(losers[right].key, losers[left].key)))
-            {
-              // Left one is less or equal.
-              losers[root] = losers[right];
-              return left;
-            }
-          else
-            {
-              // Right one is less.
-              losers[root] = losers[left];
-              return right;
-            }
-        }
-    }
-
-    void
-    init_stable()
-    { losers[0] = losers[init_winner_stable(1)]; }
-
-    // Do not pass const reference since key will be used as local variable.
-    void
-    delete_min_insert_stable(T key, bool sup)
-    {
-      int source = losers[0].source;
-      for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-        {
-          // The smaller one gets promoted, ties are broken by source.
-          if ( (sup && (!losers[pos].sup || losers[pos].source < source))
-                || (!sup && !losers[pos].sup
-                  && ((comp(losers[pos].key, key))
-                    || (!comp(key, losers[pos].key)
-                      && losers[pos].source < source))))
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].sup, sup);
-              std::swap(losers[pos].source, source);
-              std::swap(losers[pos].key, key);
-            }
-        }
-
-      losers[0].sup = sup;
-      losers[0].source = source;
-      losers[0].key = key;
-    }
-  };
-
-#endif
-
-#if _GLIBCXX_LOSER_TREE_REFERENCE
-
-/** @brief Guarded loser tree, either copying the whole element into
-* the tree structure, or looking up the element via the index.
-*
-*  Guarding is done explicitly through one flag sup per element,
-*  inf is not needed due to a better initialization routine.  This
-*  is a well-performing variant.
-*/
-template<typename T, typename Comparator = std::less<T> >
-  class LoserTreeReference
+                && !comp(*losers[right].keyp, *losers[left].keyp)))
+          {
+            // Left one is less or equal.
+            losers[root] = losers[right];
+            return left;
+          }
+        else
+          {
+            // Right one is less.
+            losers[root] = losers[left];
+            return right;
+          }
+      }
+  }
+
+  void init()
+  { losers[0] = losers[init_winner(1)]; }
+
+  void delete_min_insert(const T& key, bool sup)
   {
-#undef COPY
-#ifdef COPY
-#define KEY(i) losers[i].key
-#define KEY_SOURCE(i) key
-#else
-#define KEY(i) keys[losers[i].source]
-#define KEY_SOURCE(i) keys[i]
-#endif
-  private:
-    struct Loser
-    {
-      bool sup;
-      int source;
-#ifdef COPY
-      T key;
-#endif
-    };
+    const T* keyp = &key;
+    int source = losers[0].source;
+    for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
+      {
+        // The smaller one gets promoted.
+        if (sup || (!losers[pos].sup && comp(*losers[pos].keyp, *keyp)))
+          {
+            // The other one is smaller.
+            std::swap(losers[pos].sup, sup);
+            std::swap(losers[pos].source, source);
+            std::swap(losers[pos].keyp, keyp);
+          }
+      }
+
+    losers[0].sup = sup;
+    losers[0].source = source;
+    losers[0].keyp = keyp;
+  }
+};
+
+/** @brief Base class for unguarded LoserTree implementation.
+ * 
+ * The whole element is copied into the tree structure.
+ *
+ * No guarding is done, therefore no input sequence may ever run
+ * empty.  Unused sequence heads are marked with a sentinel which
+ * is greater than all elements that are to be merged.
+ *
+ * This is a very fast variant.
+ */
+template<typename T, typename Comparator>
+class LoserTreeUnguardedBase
+{
+protected:
+  struct Loser
+  {
+    int source;
+    T key;
+  };
 
-    unsigned int ik, k, offset;
-    Loser* losers;
-#ifndef COPY
-    T* keys;
-#endif
-    Comparator comp;
+  unsigned int ik, k, offset;
+  Loser* losers;
+  Comparator comp;
 
-  public:
-    LoserTreeReference(unsigned int _k, Comparator _comp = std::less<T>())
+public:
+  inline
+  LoserTreeUnguardedBase(unsigned int _k, const T _sentinel,
+                         Comparator _comp = std::less<T>())
     : comp(_comp)
-    {
-      ik = _k;
-
-      // Next greater power of 2.
-      k = 1 << (log2(ik - 1) + 1);
-      offset = k;
-      losers = new Loser[k * 2];
-#ifndef COPY
-      keys = new T[ik];
-#endif
-      for (unsigned int i = ik - 1; i < k; ++i)
-        losers[i + k].sup = true;
-    }
-
-    ~LoserTreeReference()
-    {
-      delete[] losers;
-#ifndef COPY
-      delete[] keys;
-#endif
-    }
-
-    int
-    get_min_source()
-    { return losers[0].source; }
-
-    void
-    insert_start(T key, int source, bool sup)
-    {
-      unsigned int pos = k + source;
-
-      losers[pos].sup = sup;
-      losers[pos].source = source;
-      KEY(pos) = key;
-    }
-
-    unsigned int
-    init_winner(unsigned int root)
-    {
-      if (root >= k)
-        {
-          return root;
-        }
-      else
-        {
-          unsigned int left = init_winner (2 * root);
-          unsigned int right = init_winner (2 * root + 1);
-          if ( losers[right].sup ||
-                (!losers[left].sup && !comp(KEY(right), KEY(left))))
-            {
-              // Left one is less or equal.
-              losers[root] = losers[right];
-              return left;
-            }
-          else
-            {
-              // Right one is less.
-              losers[root] = losers[left];
-              return right;
-            }
-        }
-    }
-
-    void
-    init()
-    {
-      losers[0] = losers[init_winner(1)];
-    }
-
-    void
-    delete_min_insert(T key, bool sup)
-    {
-      int source = losers[0].source;
-      for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-        {
-          // The smaller one gets promoted.
-          if (sup || (!losers[pos].sup && comp(KEY(pos), KEY_SOURCE(source))))
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].sup, sup);
-              std::swap(losers[pos].source, source);
-#ifdef COPY
-              std::swap(KEY(pos), KEY_SOURCE(source));
-#endif
-            }
-        }
-
-      losers[0].sup = sup;
-      losers[0].source = source;
-#ifdef COPY
-      KEY(0) = KEY_SOURCE(source);
+  {
+    ik = _k;
+
+    // Next greater power of 2.
+    k = 1 << (log2(ik - 1) + 1);
+    offset = k;
+    // Allocate raw storage so that T need not be default-constructible.
+    losers = static_cast<Loser*>(::operator new(2 * k * sizeof(Loser)));
+
+    // Every entry starts out as a dummy: sentinel key, source -1.
+    for (unsigned int i = /*k + ik - 1*/0; i < (2 * k); ++i)
+      {
+        // The storage is uninitialized, so the key is copy-constructed in
+        // place; assigning to it would operate on an object that does not
+        // exist yet.
+        new(&(losers[i].key)) T(_sentinel);
+        losers[i].source = -1;
+      }
+  }
+
+  // Releases the storage that the constructor obtained with
+  // ::operator new.  Only the memory is freed here; no destructor of
+  // losers[].key is invoked.
+  inline ~LoserTreeUnguardedBase()
+  { ::operator delete(losers); }
+
+  /** @brief Return the source index stored in the root entry losers[0]. */
+  inline int
+  get_min_source()
+  {
+    // no dummy sequence can ever be at the top!
+#if _GLIBCXX_ASSERTIONS
+    _GLIBCXX_PARALLEL_ASSERT(losers[0].source != -1);
 #endif
-    }
+    return losers[0].source;
+  }
 
-    void
-    insert_start_stable(T key, int source, bool sup)
-    { return insert_start(key, source, sup); }
-
-    unsigned int
-    init_winner_stable(unsigned int root)
-    {
-      if (root >= k)
-        {
-          return root;
-        }
-      else
-        {
-          unsigned int left = init_winner (2 * root);
-          unsigned int right = init_winner (2 * root + 1);
-          if (losers[right].sup
-              || (!losers[left].sup && !comp(KEY(right), KEY(left))))
-            {
-              // Left one is less or equal.
-              losers[root] = losers[right];
-              return left;
-            }
-          else
-            {
-              // Right one is less.
-              losers[root] = losers[left];
-              return right;
-            }
-        }
-    }
-
-    void
-    init_stable()
-    { losers[0] = losers[init_winner_stable(1)]; }
-
-    void
-    delete_min_insert_stable(T key, bool sup)
-    {
-      int source = losers[0].source;
-      for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-        {
-          // The smaller one gets promoted, ties are broken by source.
-          if ((sup && (!losers[pos].sup || losers[pos].source < source))
-             || (!sup && !losers[pos].sup
-                 && ((comp(KEY(pos), KEY_SOURCE(source)))
-                     || (!comp(KEY_SOURCE(source), KEY(pos))
-                         && losers[pos].source < source))))
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].sup, sup);
-              std::swap(losers[pos].source, source);
-#ifdef COPY
-              std::swap(KEY(pos), KEY_SOURCE(source));
-#endif
-            }
-        }
+  /** @brief Store the initial element of a sequence in its leaf.
+   *  @param key Element that is copy-constructed into the leaf.
+   *  @param source Index of the sequence; the leaf used is k + source.
+   *  The unnamed bool parameter is ignored. */
+  inline void
+  insert_start(const T& key, int source, bool)
+  {
+    Loser& leaf = losers[k + source];
+
+    new(&(leaf.key)) T(key);
+    leaf.source = source;
+  }
+};
+
+/**
+ * @brief Stable implementation of unguarded LoserTree.
+ *
+ * Unstable variant is selected below with partial specialization.
+ *
+ * Keys are compared with the comparator held by the base class; in
+ * delete_min_insert, equivalent keys are ordered by their source index.
+ */
+template<bool stable/* default == true */, typename T, typename Comparator>
+class LoserTreeUnguarded : public LoserTreeUnguardedBase<T, Comparator>
+{
+  typedef LoserTreeUnguardedBase<T, Comparator> Base;
+  using Base::k;
+  using Base::losers;
+  // comp is a member of a dependent base class; it has to be named
+  // explicitly, otherwise unqualified lookup in the template cannot find it.
+  using Base::comp;
+
+public:
+  LoserTreeUnguarded(unsigned int _k, const T _sentinel,
+                     Comparator _comp = std::less<T>())
+    : Base::LoserTreeUnguardedBase(_k, _sentinel, _comp)
+  {}
+
+  /** @brief Recursively play the initial tournament below node root.
+   *
+   *  The loser of every inner node is stored in that node.
+   *  @param root Index of the subtree root; indices >= k are leaves.
+   *  @return Index of the leaf holding the winner of the subtree. */
+  unsigned int
+  init_winner(unsigned int root)
+  {
+    if (root >= k)
+      {
+        return root;
+      }
+    else
+      {
+        unsigned int left = init_winner (2 * root);
+        unsigned int right = init_winner (2 * root + 1);
+        if (!comp(losers[right].key, losers[left].key))
+          {
+            // Left one is less or equal.
+            losers[root] = losers[right];
+            return left;
+          }
+        else
+          {
+            // Right one is less.
+            losers[root] = losers[left];
+            return right;
+          }
+      }
+  }
+
+  /** @brief Build the tree and store the overall winner in losers[0]. */
+  inline void
+  init()
+  {
+    losers[0] = losers[init_winner(1)];
 
-      losers[0].sup = sup;
-      losers[0].source = source;
-#ifdef COPY
-      KEY(0) = KEY_SOURCE(source);
+    // no dummy sequence can ever be at the top at the beginning (0 sequences!)
+#if _GLIBCXX_ASSERTIONS
+    _GLIBCXX_PARALLEL_ASSERT(losers[0].source != -1);
 #endif
-    }
-  };
-#undef KEY
-#undef KEY_SOURCE
+  }
 
+  // Replaces the entry at the top by key, which is attributed to the
+  // source currently stored in losers[0], and replays the path from that
+  // source's leaf up to the root.
+  // Do not pass a const reference since key will be used as local variable.
+  inline void
+  delete_min_insert(T key, bool)
+  {
+    // No dummy sequence can ever be at the top and be retrieved!
+#if _GLIBCXX_ASSERTIONS
+    _GLIBCXX_PARALLEL_ASSERT(losers[0].source != -1);
 #endif
 
-#if _GLIBCXX_LOSER_TREE_POINTER
-
-/** @brief Guarded loser tree, either copying the whole element into
-    the tree structure, or looking up the element via the index.
-*  Guarding is done explicitly through one flag sup per element,
-*  inf is not needed due to a better initialization routine.
-*  This is a well-performing variant.
-*/
-template<typename T, typename Comparator = std::less<T> >
-  class LoserTreePointer
+    int source = losers[0].source;
+    for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
+      {
+        // The smaller one gets promoted, ties are broken by source.
+        if (comp(losers[pos].key, key)
+              || (!comp(key, losers[pos].key) && losers[pos].source < source))
+          {
+            // The other one is smaller.
+            std::swap(losers[pos].source, source);
+            std::swap(losers[pos].key, key);
+          }
+      }
+
+    losers[0].source = source;
+    losers[0].key = key;
+  }
+};
+
+/**
+ * @brief Non-Stable implementation of unguarded LoserTree.
+ *
+ * Stable implementation is above.
+ */
+template<typename T, typename Comparator>
+class LoserTreeUnguarded</* stable == */false, T, Comparator> :
+    public LoserTreeUnguardedBase<T, Comparator>
+{
+  typedef LoserTreeUnguardedBase<T, Comparator> Base;
+  using Base::k;
+  using Base::losers;
+  // comp is a member of a dependent base class; it has to be named
+  // explicitly, otherwise unqualified lookup in the template cannot find it.
+  using Base::comp;
+
+public:
+  LoserTreeUnguarded(unsigned int _k, const T _sentinel,
+                     Comparator _comp = std::less<T>())
+    : Base::LoserTreeUnguardedBase(_k, _sentinel, _comp)
+  {}
+
+  unsigned int
+  init_winner (unsigned int root)
   {
-  private:
-    struct Loser
-    {
-      bool sup;
-      int source;
-      const T* keyp;
-    };
-
-    unsigned int ik, k, offset;
-    Loser* losers;
-    Comparator comp;
-
-  public:
-    LoserTreePointer(unsigned int _k, Comparator _comp = std::less<T>())
-      : comp(_comp)
-    {
-      ik = _k;
-
-      // Next greater power of 2.
-      k = 1 << (log2(ik - 1) + 1);
-      offset = k;
-      losers = new Loser[k * 2];
-      for (unsigned int i = ik - 1; i < k; ++i)
-        losers[i + k].sup = true;
-    }
-
-    ~LoserTreePointer()
-    { delete[] losers; }
-
-    int
-    get_min_source()
-    { return losers[0].source; }
-
-    void
-    insert_start(const T& key, int source, bool sup)
-    {
-      unsigned int pos = k + source;
-
-      losers[pos].sup = sup;
-      losers[pos].source = source;
-      losers[pos].keyp = &key;
-    }
-
-    unsigned int
-    init_winner(unsigned int root)
-    {
-      if (root >= k)
-       return root;
-      else
-        {
-          unsigned int left = init_winner (2 * root);
-          unsigned int right = init_winner (2 * root + 1);
-          if (losers[right].sup
-                || (!losers[left].sup
-                  && !comp(*losers[right].keyp, *losers[left].keyp)))
-            {
-              // Left one is less or equal.
-              losers[root] = losers[right];
-              return left;
-            }
-          else
-            {
-              // Right one is less.
-              losers[root] = losers[left];
-              return right;
-            }
-        }
-    }
-
-    void
-    init()
-    { losers[0] = losers[init_winner(1)]; }
-
-    void
-    delete_min_insert(const T& key, bool sup)
-    {
-      const T* keyp = &key;
-      int source = losers[0].source;
-      for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-        {
-          // The smaller one gets promoted.
-          if (sup || (!losers[pos].sup && comp(*losers[pos].keyp, *keyp)))
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].sup, sup);
-              std::swap(losers[pos].source, source);
-              std::swap(losers[pos].keyp, keyp);
-            }
-        }
-
-      losers[0].sup = sup;
-      losers[0].source = source;
-      losers[0].keyp = keyp;
-    }
-
-    void
-    insert_start_stable(const T& key, int source, bool sup)
-    { return insert_start(key, source, sup); }
-
-    unsigned int
-    init_winner_stable(unsigned int root)
-    {
-      if (root >= k)
-        {
-          return root;
-        }
-      else
-        {
-          unsigned int left = init_winner (2 * root);
-          unsigned int right = init_winner (2 * root + 1);
-          if (losers[right].sup
-              || (!losers[left].sup && !comp(*losers[right].keyp,
-                                            *losers[left].keyp)))
-            {
-              // Left one is less or equal.
-              losers[root] = losers[right];
-              return left;
-            }
-          else
-            {
-              // Right one is less.
-              losers[root] = losers[left];
-              return right;
-            }
-        }
-    }
-
-    void
-    init_stable()
-    { losers[0] = losers[init_winner_stable(1)]; }
-
-    void
-    delete_min_insert_stable(const T& key, bool sup)
-    {
-      const T* keyp = &key;
-      int source = losers[0].source;
-      for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
-        {
-          // The smaller one gets promoted, ties are broken by source.
-          if ( (sup && (!losers[pos].sup || losers[pos].source < source))
-               || (!sup && !losers[pos].sup &&
-                   ((comp(*losers[pos].keyp, *keyp))
-                    || (!comp(*keyp, *losers[pos].keyp)
-                        && losers[pos].source < source))))
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].sup, sup);
-              std::swap(losers[pos].source, source);
-              std::swap(losers[pos].keyp, keyp);
-            }
-        }
-
-      losers[0].sup = sup;
-      losers[0].source = source;
-      losers[0].keyp = keyp;
-    }
-  };
-
+    if (root >= k)
+      {
+        return root;
+      }
+    else
+      {
+        unsigned int left = init_winner (2 * root);
+        unsigned int right = init_winner (2 * root + 1);
+
+#if _GLIBCXX_ASSERTIONS
+        // If left one is sentinel then right one must be, too.
+        if (losers[left].source == -1)
+          _GLIBCXX_PARALLEL_ASSERT(losers[right].source == -1);
 #endif
 
-#if _GLIBCXX_LOSER_TREE_UNGUARDED
-
-/** @brief Unguarded loser tree, copying the whole element into the
-* tree structure.
-*
-*  No guarding is done, therefore not a single input sequence must
-*  run empty.  This is a very fast variant.
-*/
-template<typename T, typename Comparator = std::less<T> >
-  class LoserTreeUnguarded
+        if (!comp(losers[right].key, losers[left].key))
+          {
+            // Left one is less or equal.
+            losers[root] = losers[right];
+            return left;
+          }
+        else
+          {
+            // Right one is less.
+            losers[root] = losers[left];
+            return right;
+          }
+      }
+  }
+
+  inline void
+  init()
   {
-  private:
-    struct Loser
-    {
-      int source;
-      T key;
-    };
-
-    unsigned int ik, k, offset;
-    unsigned int* mapping;
-    Loser* losers;
-    Comparator comp;
-
-    void
-    map(unsigned int root, unsigned int begin, unsigned int end)
-    {
-      if (begin + 1 == end)
-        mapping[begin] = root;
-      else
-        {
-          // Next greater or equal power of 2.
-          unsigned int left = 1 << (log2(end - begin - 1));
-          map(root * 2, begin, begin + left);
-          map(root * 2 + 1, begin + left, end);
-        }
-    }
-
-  public:
-    LoserTreeUnguarded(unsigned int _k, Comparator _comp = std::less<T>())
-    : comp(_comp)
-    {
-      ik = _k;
-      // Next greater or equal power of 2.
-      k = 1 << (log2(ik - 1) + 1);
-      offset = k;
-      losers = new Loser[k + ik];
-      mapping = new unsigned int[ik];
-      map(1, 0, ik);
-    }
-
-    ~LoserTreeUnguarded()
-    {
-      delete[] losers;
-      delete[] mapping;
-    }
-
-    int
-    get_min_source()
-    { return losers[0].source; }
-
-    void
-    insert_start(const T& key, int source, bool)
-    {
-      unsigned int pos = mapping[source];
-      losers[pos].source = source;
-      losers[pos].key = key;
-    }
-
-    unsigned int
-    init_winner(unsigned int root, unsigned int begin, unsigned int end)
-    {
-      if (begin + 1 == end)
-        return mapping[begin];
-      else
-        {
-          // Next greater or equal power of 2.
-          unsigned int division = 1 << (log2(end - begin - 1));
-          unsigned int left = init_winner(2 * root, begin, begin + division);
-          unsigned int right =
-                          init_winner(2 * root + 1, begin + division, end);
-          if (!comp(losers[right].key, losers[left].key))
-            {
-              // Left one is less or equal.
-              losers[root] = losers[right];
-              return left;
-            }
-          else
-            {
-              // Right one is less.
-              losers[root] = losers[left];
-              return right;
-            }
-        }
-    }
-
-    void
-    init()
-    { losers[0] = losers[init_winner(1, 0, ik)]; }
-
-    // Do not pass const reference since key will be used as local variable.
-    void
-    delete_min_insert(const T& key, bool)
-    {
-      losers[0].key = key;
-      T& keyr = losers[0].key;
-      int& source = losers[0].source;
-      for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
-        {
-          // The smaller one gets promoted.
-          if (comp(losers[pos].key, keyr))
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].source, source);
-              std::swap(losers[pos].key, keyr);
-            }
-        }
-    }
-
-    void
-    insert_start_stable(const T& key, int source, bool)
-    { return insert_start(key, source, false); }
-
-    void
-    init_stable()
-    { init(); }
-
-    void
-    delete_min_insert_stable(const T& key, bool)
-    {
-      losers[0].key = key;
-      T& keyr = losers[0].key;
-      int& source = losers[0].source;
-      for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
-        {
-          // The smaller one gets promoted, ties are broken by source.
-          if (comp(losers[pos].key, keyr)
-              || (!comp(keyr, losers[pos].key)
-                && losers[pos].source < source))
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].source, source);
-              std::swap(losers[pos].key, keyr);
-            }
-        }
-    }
-  };
+    losers[0] = losers[init_winner(1)];
 
+    // no dummy sequence can ever be at the top at the beginning (0 sequences!)
+#if _GLIBCXX_ASSERTIONS
+    _GLIBCXX_PARALLEL_ASSERT(losers[0].source != -1);
 #endif
+  }
 
-#if _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
+  // Replaces the entry at the top by key, which is attributed to the
+  // source currently stored in losers[0], and replays the path from that
+  // source's leaf up to the root.  Ties are not broken by source here.
+  // Do not pass a const reference since key will be used as local variable.
+  inline void
+  delete_min_insert(T key, bool)
+  {
+    // No dummy sequence can ever be at the top and be retrieved!
+#if _GLIBCXX_ASSERTIONS
+    _GLIBCXX_PARALLEL_ASSERT(losers[0].source != -1);
+#endif
+
+    int source = losers[0].source;
+    for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
+      {
+        // The smaller one gets promoted.
+        if (comp(losers[pos].key, key))
+          {
+            // The other one is smaller.
+            std::swap(losers[pos].source, source);
+            std::swap(losers[pos].key, key);
+          }
+      }
+
+    losers[0].source = source;
+    losers[0].key = key;
+  }
+};
 
 /** @brief Unguarded loser tree, keeping only pointers to the
 * elements in the tree structure.
@@ -926,175 +773,233 @@ template<typename T, typename Comparator = std::less<T> >
 *  No guarding is done, therefore not a single input sequence must
 *  run empty.  This is a very fast variant.
 */
-template<typename T, typename Comparator = std::less<T> >
-  class LoserTreePointerUnguarded
+template<typename T, typename Comparator>
+class LoserTreePointerUnguardedBase
+{
+protected:
+  struct Loser
   {
-  private:
-    struct Loser
-    {
-      int source;
-      const T* keyp;
-    };
-
-    unsigned int ik, k, offset;
-    unsigned int* mapping;
-    Loser* losers;
-    Comparator comp;
-
-    void map(unsigned int root, unsigned int begin, unsigned int end)
-    {
-      if (begin + 1 == end)
-        mapping[begin] = root;
-      else
-        {
-          // Next greater or equal power of 2.
-          unsigned int left = 1 << (log2(end - begin - 1));
-          map(root * 2, begin, begin + left);
-          map(root * 2 + 1, begin + left, end);
-        }
-    }
-
-  public:
-    LoserTreePointerUnguarded(unsigned int _k,
-                              Comparator _comp = std::less<T>())
-    : comp(_comp)
-    {
-      ik = _k;
-
-      // Next greater power of 2.
-      k = 1 << (log2(ik - 1) + 1);
-      offset = k;
-      losers = new Loser[k + ik];
-      mapping = new unsigned int[ik];
-      map(1, 0, ik);
-    }
-
-    ~LoserTreePointerUnguarded()
-    {
-      delete[] losers;
-      delete[] mapping;
-    }
-
-    int
-    get_min_source()
-    { return losers[0].source; }
-
-    void
-    insert_start(const T& key, int source, bool)
-    {
-      unsigned int pos = mapping[source];
-      losers[pos].source = source;
-      losers[pos].keyp = &key;
-    }
-
-    unsigned int
-    init_winner(unsigned int root, unsigned int begin, unsigned int end)
-    {
-      if (begin + 1 == end)
-        return mapping[begin];
-      else
-        {
-          // Next greater or equal power of 2.
-          unsigned int division = 1 << (log2(end - begin - 1));
-          unsigned int left = init_winner(2 * root, begin, begin + division);
-          unsigned int right = init_winner(2 * root + 1,
-                                          begin + division, end);
-          if (!comp(*losers[right].keyp, *losers[left].keyp))
-            {
-              // Left one is less or equal.
-              losers[root] = losers[right];
-              return left;
-            }
-          else
-            {
-              // Right one is less.
-              losers[root] = losers[left];
-              return right;
-            }
-        }
-    }
+    int source;
+    const T* keyp;
+  };
 
-    void
-    init()
-    { losers[0] = losers[init_winner(1, 0, ik)]; }
+  unsigned int ik, k, offset;
+  Loser* losers;
+  const T sentinel;
+  Comparator comp;
 
-    void
-    delete_min_insert(const T& key, bool)
-    {
-      const T* keyp = &key;
-      int& source = losers[0].source;
-      for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
-        {
-          // The smaller one gets promoted.
-          if (comp(*losers[pos].keyp, *keyp))
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].source, source);
-              std::swap(losers[pos].keyp, keyp);
-            }
-        }
-
-      losers[0].keyp = keyp;
-    }
+public:
 
-    void
-    insert_start_stable(const T& key, int source, bool)
-    { return insert_start(key, source, false); }
+  /** @brief Set up a tree in which every entry is a dummy.
+   *  @param _k Number of sequences.
+   *  @param _sentinel Value copied into the sentinel member; every entry
+   *  initially points to that copy.
+   *  @param _comp Comparator copied into the comp member. */
+  inline
+  LoserTreePointerUnguardedBase(unsigned int _k, const T _sentinel,
+      Comparator _comp = std::less<T>())
+    : sentinel(_sentinel), comp(_comp)
+  {
+    ik = _k;
+
+    // Next greater power of 2.
+    k = 1 << (log2(ik - 1) + 1);
+    offset = k;
+    // Loser holds only an int and a pointer here, so a plain array new is
+    // used; there is no key member to construct.
+    losers = new Loser[2 * k];
+
+    // Mark all entries as dummies: source -1, key pointer to the sentinel.
+    // NOTE(review): the entries point at this object's own sentinel member,
+    // so copying the object looks unsafe -- confirm it is never copied.
+    for (unsigned int i = /*k + ik - 1*/0; i < (2 * k); ++i)
+      {
+        losers[i].keyp = &sentinel;
+        losers[i].source = -1;
+      }
+  }
+
+  // Frees the losers array; the elements pointed to by keyp are not touched.
+  inline ~LoserTreePointerUnguardedBase()
+  { delete[] losers; }
+
+  /** @brief Return the source index stored in the root entry losers[0]. */
+  inline int
+  get_min_source()
+  {
+    // no dummy sequence can ever be at the top!
+#if _GLIBCXX_ASSERTIONS
+    _GLIBCXX_PARALLEL_ASSERT(losers[0].source != -1);
+#endif
+    return losers[0].source;
+  }
 
-    void
-    init_stable()
-    { init(); }
+  /** @brief Register the initial element of a sequence in its leaf.
+   *  @param key Element whose address is stored; it is not copied, so it
+   *  has to stay valid for as long as the tree refers to it.
+   *  @param source Index of the sequence; the leaf used is k + source.
+   *  The unnamed bool parameter is ignored. */
+  inline void
+  insert_start(const T& key, int source, bool)
+  {
+    Loser& leaf = losers[k + source];
+
+    leaf.keyp = &key;
+    leaf.source = source;
+  }
+};
+
+/**
+ * @brief Stable unguarded LoserTree variant storing pointers.
+ *
+ * Unstable variant is implemented below using partial specialization.
+ *
+ * Elements are compared through the stored pointers with the comparator
+ * held by the base class; in delete_min_insert, equivalent elements are
+ * ordered by their source index.
+ */
+template<bool stable/* default == true */, typename T, typename Comparator>
+class LoserTreePointerUnguarded :
+    public LoserTreePointerUnguardedBase<T, Comparator>
+{
+  typedef LoserTreePointerUnguardedBase<T, Comparator> Base;
+  using Base::k;
+  using Base::losers;
+  // comp is a member of a dependent base class; it has to be named
+  // explicitly, otherwise unqualified lookup in the template cannot find it.
+  using Base::comp;
+
+public:
+  LoserTreePointerUnguarded(unsigned int _k, const T _sentinel,
+      Comparator _comp = std::less<T>())
+    : Base::LoserTreePointerUnguardedBase(_k, _sentinel, _comp)
+  {}
+
+  /** @brief Recursively play the initial tournament below node root.
+   *
+   *  The loser of every inner node is stored in that node.
+   *  @param root Index of the subtree root; indices >= k are leaves.
+   *  @return Index of the leaf holding the winner of the subtree. */
+  unsigned int
+  init_winner(unsigned int root)
+  {
+    if (root >= k)
+      {
+        return root;
+      }
+    else
+      {
+        unsigned int left = init_winner (2 * root);
+        unsigned int right = init_winner (2 * root + 1);
+        if (!comp(*losers[right].keyp, *losers[left].keyp))
+          {
+            // Left one is less or equal.
+            losers[root] = losers[right];
+            return left;
+          }
+        else
+          {
+            // Right one is less.
+            losers[root] = losers[left];
+            return right;
+          }
+      }
+  }
+
+  /** @brief Build the tree and store the overall winner in losers[0]. */
+  inline void
+  init()
+  {
+    losers[0] = losers[init_winner(1)];
 
-    void
-    delete_min_insert_stable(const T& key, bool)
-    {
-      int& source = losers[0].source;
-      const T* keyp = &key;
-      for (int pos = mapping[source] / 2; pos > 0; pos /= 2)
-        {
-          // The smaller one gets promoted, ties are broken by source.
-          if (comp(*losers[pos].keyp, *keyp)
-              || (!comp(*keyp, *losers[pos].keyp)
-                  && losers[pos].source < source))
-            {
-              // The other one is smaller.
-              std::swap(losers[pos].source, source);
-              std::swap(losers[pos].keyp, keyp);
-            }
-        }
-      losers[0].keyp = keyp;
-    }
-  };
+    // no dummy sequence can ever be at the top at the beginning (0 sequences!)
+#if _GLIBCXX_ASSERTIONS
+    _GLIBCXX_PARALLEL_ASSERT(losers[0].source != -1);
 #endif
+  }
 
-template<typename _ValueTp, class Comparator>
-  struct loser_tree_traits
+  // Replaces the entry at the top by a pointer to key, which is attributed
+  // to the source currently stored in losers[0], and replays the path from
+  // that source's leaf up to the root.  Only the address of key is kept.
+  // The sup parameter is not used.
+  inline void
+  delete_min_insert(const T& key, bool sup)
   {
-#if _GLIBCXX_LOSER_TREE
-    typedef LoserTree<_ValueTp, Comparator> LT;
-#else
-#  if _GLIBCXX_LOSER_TREE_POINTER
-    typedef LoserTreePointer<_ValueTp, Comparator> LT;
-#  else
-#    error Must define some type in losertree.h.
-#  endif
+    const T* keyp = &key;
+    int source = losers[0].source;
+    for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
+      {
+        // The smaller one gets promoted, ties are broken by source.
+        if (comp(*losers[pos].keyp, *keyp)
+          || (!comp(*keyp, *losers[pos].keyp) && losers[pos].source < source))
+          {
+            // The other one is smaller.
+            std::swap(losers[pos].source, source);
+            std::swap(losers[pos].keyp, keyp);
+          }
+      }
+
+    losers[0].source = source;
+    losers[0].keyp = keyp;
+
+    // no dummy sequence can ever be at the top!
+#if _GLIBCXX_ASSERTIONS
+    _GLIBCXX_PARALLEL_ASSERT(losers[0].source != -1);
+#endif
+  }
+};
+
+/**
+ * @brief Unstable unguarded LoserTree variant storing pointers.
+ *
+ * Stable variant is above.
+ *
+ * Elements are compared through the stored pointers with the comparator
+ * held by the base class; ties are not broken by source index.
+ */
+template<typename T, typename Comparator>
+class LoserTreePointerUnguarded</* stable == */false, T, Comparator> :
+    public LoserTreePointerUnguardedBase<T, Comparator>
+{
+  typedef LoserTreePointerUnguardedBase<T, Comparator> Base;
+  using Base::k;
+  using Base::losers;
+  // comp is a member of a dependent base class; it has to be named
+  // explicitly, otherwise unqualified lookup in the template cannot find it.
+  using Base::comp;
+
+public:
+  LoserTreePointerUnguarded(unsigned int _k, const T _sentinel,
+      Comparator _comp = std::less<T>())
+    : Base::LoserTreePointerUnguardedBase(_k, _sentinel, _comp)
+  {}
+
+  /** @brief Recursively play the initial tournament below node root.
+   *
+   *  The loser of every inner node is stored in that node.
+   *  @param root Index of the subtree root; indices >= k are leaves.
+   *  @return Index of the leaf holding the winner of the subtree. */
+  unsigned int
+  init_winner(unsigned int root)
+  {
+    if (root >= k)
+      {
+        return root;
+      }
+    else
+      {
+        unsigned int left = init_winner (2 * root);
+        unsigned int right = init_winner (2 * root + 1);
+
+#if _GLIBCXX_ASSERTIONS
+        // If left one is sentinel then right one must be, too.
+        if (losers[left].source == -1)
+          _GLIBCXX_PARALLEL_ASSERT(losers[right].source == -1);
 #endif
-  };
 
-template<typename _ValueTp, class Comparator>
-  struct loser_tree_unguarded_traits
+        if (!comp(*losers[right].keyp, *losers[left].keyp))
+          {
+            // Left one is less or equal.
+            losers[root] = losers[right];
+            return left;
+          }
+        else
+          {
+            // Right one is less.
+            losers[root] = losers[left];
+            return right;
+          }
+      }
+  }
+
+  // Builds the tree and stores the overall winner in losers[0].
+  inline void
+  init()
   {
-#if _GLIBCXX_LOSER_TREE_UNGUARDED
-    typedef LoserTreeUnguarded<_ValueTp, Comparator> LT;
-#else
-#  if _GLIBCXX_LOSER_TREE_POINTER_UNGUARDED
-    typedef LoserTreePointerUnguarded<_ValueTp, Comparator> LT;
-#  else
-#    error Must define some unguarded type in losertree.h.
-#  endif
+    losers[0] = losers[init_winner(1)];
+
+    // no dummy sequence can ever be at the top at the beginning (0 sequences!)
+#if _GLIBCXX_ASSERTIONS
+    _GLIBCXX_PARALLEL_ASSERT(losers[0].source != -1);
 #endif
-  };
+  }
 
-}
+  // Replaces the entry at the top by a pointer to key, which is attributed
+  // to the source currently stored in losers[0], and replays the path from
+  // that source's leaf up to the root.  Only the address of key is kept.
+  // The sup parameter is not used.
+  inline void
+  delete_min_insert(const T& key, bool sup)
+  {
+    const T* keyp = &key;
+    int source = losers[0].source;
+    for (unsigned int pos = (k + source) / 2; pos > 0; pos /= 2)
+      {
+        // The smaller one gets promoted.
+        if (comp(*(losers[pos].keyp), *keyp))
+          {
+            // The other one is smaller.
+            std::swap(losers[pos].source, source);
+            std::swap(losers[pos].keyp, keyp);
+          }
+      }
+
+    losers[0].source = source;
+    losers[0].keyp = keyp;
+  }
+};
+
+} // namespace __gnu_parallel
 
 #endif
index f12f31108714e97a1cc9348c6761198f03f1337f..6e0f2e382c3cd81041117daeaeaa47f5dd5cfc5b 100644 (file)
@@ -239,19 +239,26 @@ namespace __gnu_parallel
                           std::iterator_traits<RandomAccessIterator1>::
                           difference_type max_length, Comparator comp)
     {
-      typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
-       value_type;
+      typedef typename
+          std::iterator_traits<RandomAccessIterator1>::value_type value_type;
       typedef typename std::iterator_traits<RandomAccessIterator1>::
        difference_type difference_type1 /* == difference_type2 */;
       typedef typename std::iterator_traits<RandomAccessIterator3>::
        difference_type difference_type3;
+      typedef typename std::pair<RandomAccessIterator1, RandomAccessIterator1>
+        iterator_pair;
 
       std::pair<RandomAccessIterator1, RandomAccessIterator1>
        seqs[2] = { std::make_pair(begin1, end1),
                    std::make_pair(begin2, end2) };
-      RandomAccessIterator3 
-       target_end = parallel_multiway_merge(seqs, seqs + 2, target,
-                                            comp, max_length, true, false);
+      RandomAccessIterator3
+        target_end = parallel_multiway_merge
+          < /* stable = */ true, /* sentinels = */ false>(
+            seqs, seqs + 2, target, comp,
+            multiway_merge_exact_splitting
+              < /* stable = */ true, iterator_pair*,
+                Comparator, difference_type1>,
+            max_length);
 
       return target_end;
     }
index 6cc724b6015bf01b073ffa6be9cdd343d83f27c0..40a2f1bc6af55533c5601a2ff3e4b9cc4000f5d4 100644 (file)
@@ -40,7 +40,7 @@
 *  This file is a GNU parallel extension to the Standard C++ Library.
 */
 
-// Written by Johannes Singler.
+// Written by Johannes Singler and Manuel Holtgrewe.
 
 #ifndef _GLIBCXX_PARALLEL_MULTIWAY_MERGE_H
 #define _GLIBCXX_PARALLEL_MULTIWAY_MERGE_H
@@ -50,7 +50,6 @@
 #include <bits/stl_algo.h>
 #include <parallel/features.h>
 #include <parallel/parallel.h>
-#include <parallel/merge.h>
 #include <parallel/losertree.h>
 #if _GLIBCXX_ASSERTIONS
 #include <parallel/checkers.h>
 /** @brief Length of a sequence described by a pair of iterators. */
 #define _GLIBCXX_PARALLEL_LENGTH(s) ((s).second - (s).first)
 
-// XXX need iterator typedefs
 namespace __gnu_parallel
 {
+
+// Announce guarded and unguarded iterator.
+
 template<typename RandomAccessIterator, typename Comparator>
   class guarded_iterator;
 
+// Making the arguments const references seems too dangerous,
+// since the user-defined comparator might not be const.
 template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
-            guarded_iterator<RandomAccessIterator, Comparator>& bi2);
+             guarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
 template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
-            guarded_iterator<RandomAccessIterator, Comparator>& bi2);
+              guarded_iterator<RandomAccessIterator, Comparator>& bi2);
 
-  /** @brief Iterator wrapper supporting an implicit supremum at the end
-      of the sequence, dominating all comparisons.
-      *  Deriving from RandomAccessIterator is not possible since
-      *  RandomAccessIterator need not be a class.
-      */
+/** @brief Iterator wrapper supporting an implicit supremum at the end
+ *         of the sequence, dominating all comparisons.
+ *
+ * The implicit supremum comes with a performance cost.
+ *
+ * Deriving from RandomAccessIterator is not possible since
+ * RandomAccessIterator need not be a class.
+ */
 template<typename RandomAccessIterator, typename Comparator>
   class guarded_iterator
   {
@@ -100,7 +106,7 @@ template<typename RandomAccessIterator, typename Comparator>
     *  @param comp Comparator provided for associated overloaded
     *  compare operators. */
     guarded_iterator(RandomAccessIterator begin,
-                    RandomAccessIterator end, Comparator& comp)
+                     RandomAccessIterator end, Comparator& comp)
     : current(begin), end(end), comp(comp)
     { }
 
@@ -115,7 +121,7 @@ template<typename RandomAccessIterator, typename Comparator>
 
     /** @brief Dereference operator.
     *  @return Referenced element. */
-    typename std::iterator_traits<RandomAccessIterator>::value_type
+    typename std::iterator_traits<RandomAccessIterator>::value_type&
     operator*()
     { return *current; }
 
@@ -158,7 +164,7 @@ template<typename RandomAccessIterator, typename Comparator>
 template<typename RandomAccessIterator, typename Comparator>
   inline bool
   operator<=(guarded_iterator<RandomAccessIterator, Comparator>& bi1,
-            guarded_iterator<RandomAccessIterator, Comparator>& bi2)
+               guarded_iterator<RandomAccessIterator, Comparator>& bi2)
   {
     if (bi2.current == bi2.end)        //bi1 is sup
       return bi1.current != bi1.end;   //bi2 is not sup
@@ -185,7 +191,7 @@ template<typename RandomAccessIterator, typename Comparator>
   {
   private:
     /** @brief Current iterator position. */
-    RandomAccessIterator& current;
+    RandomAccessIterator current;
     /** @brief Comparator. */
     mutable Comparator& comp;
 
@@ -195,7 +201,7 @@ template<typename RandomAccessIterator, typename Comparator>
     *  @param end Unused, only for compatibility.
     *  @param comp Unused, only for compatibility. */
     unguarded_iterator(RandomAccessIterator begin,
-                      RandomAccessIterator end, Comparator& comp)
+                       RandomAccessIterator end, Comparator& comp)
     : current(begin), comp(comp)
     { }
 
@@ -210,7 +216,7 @@ template<typename RandomAccessIterator, typename Comparator>
 
     /** @brief Dereference operator.
     *  @return Referenced element. */
-    typename std::iterator_traits<RandomAccessIterator>::value_type
+    typename std::iterator_traits<RandomAccessIterator>::value_type&
     operator*()
     { return *current; }
 
@@ -256,159 +262,41 @@ template<typename RandomAccessIterator, typename Comparator>
     return !(bi1.comp)(*bi2, *bi1);
   }
 
-/** Prepare a set of sequences to be merged without a (end) guard
- *  @param seqs_begin
- *  @param seqs_end
- *  @param comp
- *  @param min_sequence
- *  @param stable
- *  @pre (seqs_end - seqs_begin > 0) */
-template<typename RandomAccessIteratorIterator, typename Comparator>
-  typename std::iterator_traits<
-      typename std::iterator_traits<RandomAccessIteratorIterator>::value_type
-      ::first_type>::difference_type
-  prepare_unguarded(RandomAccessIteratorIterator seqs_begin,
-                    RandomAccessIteratorIterator seqs_end, Comparator comp,
-                    int& min_sequence, bool stable)
-  {
-    _GLIBCXX_CALL(seqs_end - seqs_begin)
-
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
-        ::value_type::first_type
-      RandomAccessIterator1;
-    typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
-      value_type;
-    typedef typename std::iterator_traits<RandomAccessIterator1>
-      ::difference_type
-      difference_type;
-
-    if ((*seqs_begin).first == (*seqs_begin).second)
-      {
-        // Empty sequence found, it's the first one.
-        min_sequence = 0;
-        return -1;
-      }
-
-    // Last element in sequence.
-    value_type min = *((*seqs_begin).second - 1);
-    min_sequence = 0;
-    for (RandomAccessIteratorIterator s = seqs_begin + 1; s != seqs_end; ++s)
-      {
-        if ((*s).first == (*s).second)
-          {
-            // Empty sequence found.
-            min_sequence = static_cast<int>(s - seqs_begin);
-            return -1;
-          }
-
-        // Last element in sequence.
-        const value_type& v = *((*s).second - 1);
-        if (comp(v, min))      //strictly smaller
-          {
-            min = v;
-            min_sequence = static_cast<int>(s - seqs_begin);
-          }
-      }
-
-    difference_type overhang_size = 0;
-
-    int s = 0;
-    for (s = 0; s <= min_sequence; ++s)
-      {
-        RandomAccessIterator1 split;
-        if (stable)
-          split = std::upper_bound(seqs_begin[s].first, seqs_begin[s].second,
-                                  min, comp);
-        else
-          split = std::lower_bound(seqs_begin[s].first, seqs_begin[s].second,
-                                  min, comp);
-
-        overhang_size += seqs_begin[s].second - split;
-      }
-
-    for (; s < (seqs_end - seqs_begin); ++s)
-      {
-        RandomAccessIterator1 split = std::lower_bound(
-            seqs_begin[s].first, seqs_begin[s].second, min, comp);
-        overhang_size += seqs_begin[s].second - split;
-      }
-
-    // So many elements will be left over afterwards.
-    return overhang_size;
-  }
-
-/** Prepare a set of sequences to be merged with a (end) guard (sentinel)
- *  @param seqs_begin
- *  @param seqs_end
- *  @param comp */
-template<typename RandomAccessIteratorIterator, typename Comparator>
-  typename std::iterator_traits<typename std::iterator_traits<
-      RandomAccessIteratorIterator>::value_type::first_type>::difference_type
-  prepare_unguarded_sentinel(RandomAccessIteratorIterator seqs_begin,
-                            RandomAccessIteratorIterator seqs_end,
-                            Comparator comp)
-  {
-    _GLIBCXX_CALL(seqs_end - seqs_begin)
-
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
-      ::value_type::first_type
-      RandomAccessIterator1;
-    typedef typename std::iterator_traits<RandomAccessIterator1>
-      ::value_type
-      value_type;
-    typedef typename std::iterator_traits<RandomAccessIterator1>
-      ::difference_type
-      difference_type;
-
-    // Last element in sequence.
-    value_type* max = NULL;
-    for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
-      {
-        if ((*s).first == (*s).second)
-          continue;
-
-        // Last element in sequence.
-        value_type& v = *((*s).second - 1);
-
-        // Strictly greater.
-        if (!max || comp(*max, v))
-          max = &v;
-      }
-
-    difference_type overhang_size = 0;
-    for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
-      {
-        RandomAccessIterator1 split =
-            std::lower_bound((*s).first, (*s).second, *max, comp);
-        overhang_size += (*s).second - split;
-
-        // Set sentinel.
-        *((*s).second) = *max;
-      }
-
-    // So many elements will be left over afterwards.
-    return overhang_size;
-  }
-
 /** @brief Highly efficient 3-way merging procedure.
- *  @param seqs_begin Begin iterator of iterator pair input sequence.
- *  @param seqs_end End iterator of iterator pair input sequence.
- *  @param target Begin iterator out output sequence.
- *  @param comp Comparator.
- *  @param length Maximum length to merge.
- *  @param stable Unused, stable anyway.
- *  @return End iterator of output sequence. */
+ *
+ * Merging is done with the algorithm implementation described by Peter
+ * Sanders.  Basically, the idea is to minimize the number of necessary
+ * comparisons after merging out an element.  The implementation trick
+ * that makes this fast is that the order of the sequences is stored
+ * in the instruction pointer (translated into goto labels in C++).
+ *
+ * This works well for merging up to 4 sequences.
+ *
+ * Note that making the merging stable does <em>not</em> come at a
+ * performance hit.
+ *
+ * Whether the merging is done guarded or unguarded is selected by the
+ * used iterator class.
+ *
+ * @param seqs_begin Begin iterator of iterator pair input sequence.
+ * @param seqs_end End iterator of iterator pair input sequence.
+ * @param target Begin iterator of output sequence.
+ * @param comp Comparator.
+ * @param length Maximum length to merge.
+ *
+ * @return End iterator of output sequence.
+ */
 template<template<typename RAI, typename C> class iterator,
         typename RandomAccessIteratorIterator,
         typename RandomAccessIterator3,
         typename _DifferenceTp,
         typename Comparator>
   RandomAccessIterator3
-  multiway_merge_3_variant(RandomAccessIteratorIterator seqs_begin,
-                          RandomAccessIteratorIterator seqs_end,
-                          RandomAccessIterator3 target,
-                          Comparator comp, _DifferenceTp length,
-                          bool stable)
+  multiway_merge_3_variant(
+      RandomAccessIteratorIterator seqs_begin,
+      RandomAccessIteratorIterator seqs_end,
+      RandomAccessIterator3 target,
+      Comparator comp, _DifferenceTp length)
   {
     _GLIBCXX_CALL(length);
 
@@ -423,6 +311,10 @@ template<template<typename RAI, typename C> class iterator,
     if (length == 0)
       return target;
 
+#if _GLIBCXX_ASSERTIONS
+    _DifferenceTp orig_length = length;
+#endif
+
     iterator<RandomAccessIterator1, Comparator>
       seq0(seqs_begin[0].first, seqs_begin[0].second, comp),
       seq1(seqs_begin[1].first, seqs_begin[1].second, comp),
@@ -450,17 +342,16 @@ template<template<typename RAI, typename C> class iterator,
         else
           goto s210;
       }
-
-#define _GLIBCXX_PARALLEL_MERGE_3_CASE(a,b,c,c0,c1)\
+#define _GLIBCXX_PARALLEL_MERGE_3_CASE(a,b,c,c0,c1)     \
     s ## a ## b ## c :                                  \
       *target = *seq ## a;                              \
-    ++target;                                           \
-    --length;                                           \
-    ++seq ## a;                                         \
-    if (length == 0) goto finish;                       \
-    if (seq ## a c0 seq ## b) goto s ## a ## b ## c;    \
-    if (seq ## a c1 seq ## c) goto s ## b ## a ## c;    \
-    goto s ## b ## c ## a;
+      ++target;                                         \
+      --length;                                         \
+      ++seq ## a;                                       \
+      if (length == 0) goto finish;                     \
+      if (seq ## a c0 seq ## b) goto s ## a ## b ## c;  \
+      if (seq ## a c1 seq ## c) goto s ## b ## a ## c;  \
+      goto s ## b ## c ## a;
 
     _GLIBCXX_PARALLEL_MERGE_3_CASE(0, 1, 2, <=, <=);
     _GLIBCXX_PARALLEL_MERGE_3_CASE(1, 2, 0, <=, < );
@@ -474,6 +365,14 @@ template<template<typename RAI, typename C> class iterator,
   finish:
     ;
 
+#if _GLIBCXX_ASSERTIONS
+  _GLIBCXX_PARALLEL_ASSERT(
+      ((RandomAccessIterator1)seq0 - seqs_begin[0].first) +
+      ((RandomAccessIterator1)seq1 - seqs_begin[1].first) +
+      ((RandomAccessIterator1)seq2 - seqs_begin[2].first)
+      == orig_length);
+#endif
+
     seqs_begin[0].first = seq0;
     seqs_begin[1].first = seq1;
     seqs_begin[2].first = seq2;
@@ -481,95 +380,31 @@ template<template<typename RAI, typename C> class iterator,
     return target;
   }
 
-template<typename RandomAccessIteratorIterator,
-        typename RandomAccessIterator3,
-        typename _DifferenceTp,
-        typename Comparator>
-  RandomAccessIterator3
-  multiway_merge_3_combined(RandomAccessIteratorIterator seqs_begin,
-                            RandomAccessIteratorIterator seqs_end,
-                            RandomAccessIterator3 target,
-                            Comparator comp,
-                            _DifferenceTp length, bool stable)
-  {
-    _GLIBCXX_CALL(length);
-
-    typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
-      ::value_type::first_type
-      RandomAccessIterator1;
-    typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
-      value_type;
-
-    int min_seq;
-    RandomAccessIterator3 target_end;
-
-    // Stable anyway.
-    difference_type overhang =
-        prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
-
-    difference_type total_length = 0;
-    for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
-      total_length += _GLIBCXX_PARALLEL_LENGTH(*s);
-
-    if (overhang != -1)
-      {
-        difference_type unguarded_length =
-            std::min(length, total_length - overhang);
-        target_end = multiway_merge_3_variant<unguarded_iterator>
-          (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
-        overhang = length - unguarded_length;
-      }
-    else
-      {
-        // Empty sequence found.
-        overhang = length;
-        target_end = target;
-      }
-
-#if _GLIBCXX_ASSERTIONS
-    _GLIBCXX_PARALLEL_ASSERT(target_end == target + length - overhang);
-    _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
-#endif
-
-    switch (min_seq)
-      {
-      case 0:
-        // Iterators will be advanced accordingly.
-        target_end = merge_advance(seqs_begin[1].first, seqs_begin[1].second,
-                                  seqs_begin[2].first, seqs_begin[2].second,
-                                  target_end, overhang, comp);
-        break;
-      case 1:
-        target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second,
-                                  seqs_begin[2].first, seqs_begin[2].second,
-                                  target_end, overhang, comp);
-        break;
-      case 2:
-        target_end = merge_advance(seqs_begin[0].first, seqs_begin[0].second,
-                                  seqs_begin[1].first, seqs_begin[1].second,
-                                  target_end, overhang, comp);
-        break;
-      default:
-        _GLIBCXX_PARALLEL_ASSERT(false);
-      }
-
-#if _GLIBCXX_ASSERTIONS
-    _GLIBCXX_PARALLEL_ASSERT(target_end == target + length);
-    _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
-#endif
-
-    return target_end;
-  }
-
-/** @brief Highly efficient 4-way merging procedure.
- *  @param seqs_begin Begin iterator of iterator pair input sequence.
- *  @param seqs_end End iterator of iterator pair input sequence.
- *  @param target Begin iterator out output sequence.
- *  @param comp Comparator.
- *  @param length Maximum length to merge.
- *  @param stable Unused, stable anyway.
- *  @return End iterator of output sequence. */
+/**
+ * @brief Highly efficient 4-way merging procedure.
+ *
+ * Merging is done with the algorithm implementation described by Peter
+ * Sanders. Basically, the idea is to minimize the number of necessary
+ * comparisons after merging out an element.  The implementation trick
+ * that makes this fast is that the order of the sequences is stored
+ * in the instruction pointer (translated into goto labels in C++).
+ *
+ * This works well for merging up to 4 sequences.
+ *
+ * Note that making the merging stable does <em>not</em> come at a
+ * performance hit.
+ *
+ * Whether the merging is done guarded or unguarded is selected by the
+ * used iterator class.
+ *
+ * @param seqs_begin Begin iterator of iterator pair input sequence.
+ * @param seqs_end End iterator of iterator pair input sequence.
+ * @param target Begin iterator of output sequence.
+ * @param comp Comparator.
+ * @param length Maximum length to merge.
+ *
+ * @return End iterator of output sequence.
+ */
 template<template<typename RAI, typename C> class iterator,
         typename RandomAccessIteratorIterator,
         typename RandomAccessIterator3,
@@ -579,7 +414,7 @@ template<template<typename RAI, typename C> class iterator,
   multiway_merge_4_variant(RandomAccessIteratorIterator seqs_begin,
                            RandomAccessIteratorIterator seqs_end,
                            RandomAccessIterator3 target,
-                           Comparator comp, _DifferenceTp length, bool stable)
+                           Comparator comp, _DifferenceTp length)
   {
     _GLIBCXX_CALL(length);
     typedef _DifferenceTp difference_type;
@@ -676,651 +511,467 @@ template<template<typename RAI, typename C> class iterator,
     return target;
   }
 
-template<typename RandomAccessIteratorIterator,
+/** @brief Multi-way merging procedure for a high branching factor,
+ *         guarded case.
+ *
+ * This merging variant uses a LoserTree class as selected by <tt>LT</tt>.
+ *
+ * Stability is selected through the used LoserTree class <tt>LT</tt>.
+ *
+ * @param seqs_begin Begin iterator of iterator pair input sequence.
+ * @param seqs_end End iterator of iterator pair input sequence.
+ * @param target Begin iterator of output sequence.
+ * @param comp Comparator.
+ * @param length Maximum length to merge.
+ *
+ * @return End iterator of output sequence.
+ */
+template<typename LT,
+        typename RandomAccessIteratorIterator,
         typename RandomAccessIterator3,
         typename _DifferenceTp,
         typename Comparator>
   RandomAccessIterator3
-  multiway_merge_4_combined(RandomAccessIteratorIterator seqs_begin,
+  multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin,
                             RandomAccessIteratorIterator seqs_end,
                             RandomAccessIterator3 target,
                             Comparator comp,
-                            _DifferenceTp length, bool stable)
+                            _DifferenceTp length)
   {
-    _GLIBCXX_CALL(length);
-    typedef _DifferenceTp difference_type;
+    _GLIBCXX_CALL(length)
 
+    typedef _DifferenceTp difference_type;
     typedef typename std::iterator_traits<RandomAccessIteratorIterator>
       ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
 
-    int min_seq;
-    RandomAccessIterator3 target_end;
+    int k = static_cast<int>(seqs_end - seqs_begin);
 
-    // Stable anyway.
-    difference_type overhang =
-        prepare_unguarded(seqs_begin, seqs_end, comp, min_seq, true);
+    LT lt(k, comp);
 
     difference_type total_length = 0;
-    for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
-      total_length += _GLIBCXX_PARALLEL_LENGTH(*s);
 
-    if (overhang != -1)
+    // Default value for potentially non-default-constructible types.
+    value_type* arbitrary_element = NULL;
+
+    for (int t = 0; t < k; ++t)
       {
-        difference_type unguarded_length =
-            std::min(length, total_length - overhang);
-        target_end = multiway_merge_4_variant<unguarded_iterator>
-          (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
-        overhang = length - unguarded_length;
+        if(arbitrary_element == NULL
+          && _GLIBCXX_PARALLEL_LENGTH(seqs_begin[t]) > 0)
+          arbitrary_element = &(*seqs_begin[t].first);
+        total_length += _GLIBCXX_PARALLEL_LENGTH(seqs_begin[t]);
       }
-    else
+
+    if(total_length == 0)
+      return target;
+
+    for (int t = 0; t < k; ++t)
       {
-        // Empty sequence found.
-        overhang = length;
-        target_end = target;
+        if (seqs_begin[t].first == seqs_begin[t].second)
+          lt.insert_start(*arbitrary_element, t, true);
+        else
+          lt.insert_start(*seqs_begin[t].first, t, false);
       }
 
-#if _GLIBCXX_ASSERTIONS
-    _GLIBCXX_PARALLEL_ASSERT(target_end == target + length - overhang);
-    _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
-#endif
+    lt.init();
 
-    std::vector<std::pair<RandomAccessIterator1, RandomAccessIterator1> >
-        one_missing(seqs_begin, seqs_end);
-    one_missing.erase(one_missing.begin() + min_seq);  //remove
+    const difference_type const_total_length(std::min(total_length, length));
 
-    target_end = multiway_merge_3_variant<guarded_iterator>(
-        one_missing.begin(), one_missing.end(),
-        target_end, comp, overhang, stable);
+    int source;
 
-    // Insert back again.
-    one_missing.insert(one_missing.begin() + min_seq, seqs_begin[min_seq]);
-    // Write back modified iterators.
-    copy(one_missing.begin(), one_missing.end(), seqs_begin);
+    for (difference_type i = 0; i < const_total_length; ++i)
+      {
+        //take out
+        source = lt.get_min_source();
 
-#if _GLIBCXX_ASSERTIONS
-    _GLIBCXX_PARALLEL_ASSERT(target_end == target + length);
-    _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
-#endif
+        *(target++) = *(seqs_begin[source].first++);
 
-    return target_end;
+        // Feed.
+        if (seqs_begin[source].first == seqs_begin[source].second)
+          lt.delete_min_insert(*arbitrary_element, true);
+        else
+          // Replace from same source.
+          lt.delete_min_insert(*seqs_begin[source].first, false);
+      }
+
+    return target;
   }
 
-/** @brief Basic multi-way merging procedure.
+/** @brief Multi-way merging procedure for a high branching factor,
+ *         unguarded case.
  *
- *  The head elements are kept in a sorted array, new heads are
- *  inserted linearly.
- *  @param seqs_begin Begin iterator of iterator pair input sequence.
- *  @param seqs_end End iterator of iterator pair input sequence.
- *  @param target Begin iterator out output sequence.
- *  @param comp Comparator.
- *  @param length Maximum length to merge.
- *  @param stable Stable merging incurs a performance penalty.
- *  @return End iterator of output sequence.
+ * Merging is done using the LoserTree class <tt>LT</tt>.
+ *
+ * Stability is selected by the used LoserTrees.
+ *
+ * @pre No input will run out of elements during the merge.
+ *
+ * @param seqs_begin Begin iterator of iterator pair input sequence.
+ * @param seqs_end End iterator of iterator pair input sequence.
+ * @param target Begin iterator of output sequence.
+ * @param comp Comparator.
+ * @param length Maximum length to merge.
+ *
+ * @return End iterator of output sequence.
  */
-template<typename RandomAccessIteratorIterator,
+template<typename LT,
+        typename RandomAccessIteratorIterator,
         typename RandomAccessIterator3,
-        typename _DifferenceTp,
-        typename Comparator>
+        typename _DifferenceTp, typename Comparator>
   RandomAccessIterator3
-  multiway_merge_bubble(RandomAccessIteratorIterator seqs_begin,
-                        RandomAccessIteratorIterator seqs_end,
-                        RandomAccessIterator3 target,
-                        Comparator comp, _DifferenceTp length, bool stable)
+  multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin,
+                                      RandomAccessIteratorIterator seqs_end,
+                                      RandomAccessIterator3 target,
+                                      int min_seq, Comparator comp,
+                                      _DifferenceTp length)
   {
     _GLIBCXX_CALL(length)
-
     typedef _DifferenceTp difference_type;
+
     typedef typename std::iterator_traits<RandomAccessIteratorIterator>
       ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
 
-    int k = static_cast<int>(seqs_end - seqs_begin);
-    int nrs;  // Number of remaining sequences.
+    int k = seqs_end - seqs_begin;
+
+    // Determine the sentinel.  The sentinel is the largest/last element of
+    // the sequence with the smallest largest/last element.
+    value_type sentinel = *(seqs_begin[min_seq].second - 1);
+
+    LT lt(k, sentinel, comp);
 
-    // Avoid default constructor.
-    value_type* fe = static_cast<value_type*>(
-      ::operator new(sizeof(value_type) * k));  // Front elements.
-    int* source = new int[k];
     difference_type total_length = 0;
 
-    // Write entries into queue.
-    nrs = 0;
-    for (int pi = 0; pi < k; ++pi)
+    for (int t = 0; t < k; ++t)
       {
-        if (seqs_begin[pi].first != seqs_begin[pi].second)
-          {
-            ::new(&(fe[nrs])) value_type(*(seqs_begin[pi].first));
-            source[nrs] = pi;
-            ++nrs;
-            total_length += _GLIBCXX_PARALLEL_LENGTH(seqs_begin[pi]);
-          }
-      }
+#if _GLIBCXX_ASSERTIONS
+        _GLIBCXX_PARALLEL_ASSERT(seqs_begin[t].first != seqs_begin[t].second);
+#endif
+        lt.insert_start(*seqs_begin[t].first, t, false);
 
-    if (stable)
-      {
-        // Bubble sort fe and source by fe.
-        for (int k = 0; k < nrs - 1; ++k)
-          for (int pi = nrs - 1; pi > k; --pi)
-            if (comp(fe[pi], fe[pi - 1]) ||
-                (!comp(fe[pi - 1], fe[pi]) && source[pi] < source[pi - 1]))
-              {
-                std::swap(fe[pi - 1], fe[pi]);
-                std::swap(source[pi - 1], source[pi]);
-              }
-      }
-    else
-      {
-        for (int k = 0; k < nrs - 1; ++k)
-          for (int pi = nrs - 1; pi > k; --pi)
-            if (comp(fe[pi], fe[pi-1]))
-              {
-                std::swap(fe[pi-1], fe[pi]);
-                std::swap(source[pi-1], source[pi]);
-              }
+        total_length += _GLIBCXX_PARALLEL_LENGTH(seqs_begin[t]);
       }
 
-    // Iterate.
-    if (stable)
-      {
-        int j;
-        while (nrs > 0 && length > 0)
-          {
-            if (source[0] < source[1])
-              {
-                // fe[0] <= fe[1]
-                while ((nrs == 1 || !comp(fe[1], fe[0])) && length > 0)
-                  {
-                    *target = fe[0];
-                    ++target;
-                    ++(seqs_begin[source[0]].first);
-                    --length;
-                    if (seqs_begin[source[0]].first
-                       == seqs_begin[source[0]].second)
-                      {
-                        // Move everything to the left.
-                        for (int s = 0; s < nrs - 1; ++s)
-                          {
-                            fe[s] = fe[s + 1];
-                            source[s] = source[s + 1];
-                          }
-                        fe[nrs - 1].~value_type();  //Destruct explicitly.
-                        --nrs;
-                        break;
-                      }
-                    else
-                      fe[0] = *(seqs_begin[source[0]].first);
-                  }
-              }
-            else
-              {
-                // fe[0] < fe[1]
-                while ((nrs == 1 || comp(fe[0], fe[1])) && length > 0)
-                  {
-                    *target = fe[0];
-                    ++target;
-                    ++(seqs_begin[source[0]].first);
-                    --length;
-                    if (seqs_begin[source[0]].first
-                       == seqs_begin[source[0]].second)
-                      {
-                        for (int s = 0; s < nrs - 1; ++s)
-                          {
-                            fe[s] = fe[s + 1];
-                            source[s] = source[s + 1];
-                          }
-                        fe[nrs - 1].~value_type();  //Destruct explicitly.
-                        --nrs;
-                        break;
-                      }
-                    else
-                      fe[0] = *(seqs_begin[source[0]].first);
-                  }
-              }
-
-            // Sink down.
-            j = 1;
-            while ((j < nrs) && (comp(fe[j], fe[j - 1])
-                                || (!comp(fe[j - 1], fe[j])
-                                    && (source[j] < source[j - 1]))))
-              {
-                std::swap(fe[j - 1], fe[j]);
-                std::swap(source[j - 1], source[j]);
-                ++j;
-              }
-          }
-      }
-    else
+    lt.init();
+
+    // Do not go past end.
+    length = std::min(total_length, length);
+
+    int source;
+
+#if _GLIBCXX_ASSERTIONS
+    difference_type i = 0;
+#endif
+
+    RandomAccessIterator3 target_end = target + length;
+    while (target < target_end)
       {
-        int j;
-        while (nrs > 0 && length > 0)
-          {
-            // fe[0] <= fe[1]
-            while (nrs == 1 || (!comp(fe[1], fe[0])) && length > 0)
-              {
-                *target = fe[0];
-                ++target;
-                ++seqs_begin[source[0]].first;
-                --length;
-                if (seqs_begin[source[0]].first
-                   == seqs_begin[source[0]].second)
-                  {
-                    for (int s = 0; s < (nrs - 1); ++s)
-                      {
-                        fe[s] = fe[s + 1];
-                        source[s] = source[s + 1];
-                      }
-                    fe[nrs - 1].~value_type();  //Destruct explicitly.
-                    --nrs;
-                    break;
-                  }
-                else
-                  fe[0] = *(seqs_begin[source[0]].first);
-              }
-
-            // Sink down.
-            j = 1;
-            while ((j < nrs) && comp(fe[j], fe[j - 1]))
-              {
-                std::swap(fe[j - 1], fe[j]);
-                std::swap(source[j - 1], source[j]);
-                ++j;
-              }
-          }
-      }
+        // Take out.
+        source = lt.get_min_source();
 
-    ::operator delete(fe);  //Destructors already called.
-    delete[] source;
+#if _GLIBCXX_ASSERTIONS
+        _GLIBCXX_PARALLEL_ASSERT(0 <= source && source < k);
+        _GLIBCXX_PARALLEL_ASSERT(i == 0
+            || !comp(*(seqs_begin[source].first), *(target - 1)));
+#endif
 
-    return target;
-  }
+        // Feed.
+        *(target++) = *(seqs_begin[source].first++);
 
-/** @brief Multi-way merging procedure for a high branching factor,
- * guarded case.
- *
- *  The head elements are kept in a loser tree.
- *  @param seqs_begin Begin iterator of iterator pair input sequence.
- *  @param seqs_end End iterator of iterator pair input sequence.
- *  @param target Begin iterator out output sequence.
- *  @param comp Comparator.
- *  @param length Maximum length to merge.
- *   @param stable Stable merging incurs a performance penalty.
- *  @return End iterator of output sequence.
- */
-template<typename LT,
-        typename RandomAccessIteratorIterator,
-        typename RandomAccessIterator3,
-        typename _DifferenceTp,
-        typename Comparator>
-  RandomAccessIterator3
-  multiway_merge_loser_tree(RandomAccessIteratorIterator seqs_begin,
-                            RandomAccessIteratorIterator seqs_end,
-                            RandomAccessIterator3 target,
-                            Comparator comp,
-                            _DifferenceTp length, bool stable)
-  {
-    _GLIBCXX_CALL(length)
-
-    typedef _DifferenceTp difference_type;
-    typedef typename std::iterator_traits<RandomAccessIteratorIterator>
-      ::value_type::first_type
-      RandomAccessIterator1;
-    typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
-      value_type;
-
-    int k = static_cast<int>(seqs_end - seqs_begin);
-
-    LT lt(k, comp);
-
-    difference_type total_length = 0;
-
-    // Default value for potentially non-default-constructible types.
-    value_type* arbitrary_element = NULL;
-
-    for (int t = 0; t < k; ++t)
-      {
-        if(arbitrary_element == NULL
-          && _GLIBCXX_PARALLEL_LENGTH(seqs_begin[t]) > 0)
-          arbitrary_element = &(*seqs_begin[t].first);
-        total_length += _GLIBCXX_PARALLEL_LENGTH(seqs_begin[t]);
-      }
-
-    if(total_length == 0)
-      return target;
-
-    for (int t = 0; t < k; ++t)
-      {
-        if (stable)
-          {
-            if (seqs_begin[t].first == seqs_begin[t].second)
-              lt.insert_start_stable(*arbitrary_element, t, true);
-            else
-              lt.insert_start_stable(*seqs_begin[t].first, t, false);
-          }
-        else
-          {
-            if (seqs_begin[t].first == seqs_begin[t].second)
-              lt.insert_start(*arbitrary_element, t, true);
-            else
-              lt.insert_start(*seqs_begin[t].first, t, false);
-          }
-      }
-
-    if (stable)
-      lt.init_stable();
-    else
-      lt.init();
-
-    total_length = std::min(total_length, length);
-
-    int source;
-
-    if (stable)
-      {
-        for (difference_type i = 0; i < total_length; ++i)
-          {
-            // Take out.
-            source = lt.get_min_source();
-
-            *(target++) = *(seqs_begin[source].first++);
-
-            // Feed.
-            if (seqs_begin[source].first == seqs_begin[source].second)
-              lt.delete_min_insert_stable(*arbitrary_element, true);
-            else
-              // Replace from same source.
-              lt.delete_min_insert_stable(*seqs_begin[source].first, false);
-
-          }
-      }
-    else
-      {
-        for (difference_type i = 0; i < total_length; ++i)
-          {
-            //take out
-            source = lt.get_min_source();
-
-            *(target++) = *(seqs_begin[source].first++);
-
-            // Feed.
-            if (seqs_begin[source].first == seqs_begin[source].second)
-              lt.delete_min_insert(*arbitrary_element, true);
-            else
-              // Replace from same source.
-              lt.delete_min_insert(*seqs_begin[source].first, false);
-          }
+#if _GLIBCXX_ASSERTIONS
+        _GLIBCXX_PARALLEL_ASSERT(
+            (seqs_begin[source].first != seqs_begin[source].second)
+            || (i >= length - 1));
+        ++i;
+#endif
+        // Replace from same source.
+        lt.delete_min_insert(*seqs_begin[source].first, false);
       }
 
     return target;
   }
 
+
 /** @brief Multi-way merging procedure for a high branching factor,
- * unguarded case.
+ *         requiring sentinels to exist.
+ * @param stable The value must be the same as for the used LoserTrees.
+ * @param UnguardedLoserTree Loser Tree variant to use for the unguarded
+ *   merging.
+ * @param GuardedLoserTree Loser Tree variant to use for the guarded
+ *   merging.
  *
- *  The head elements are kept in a loser tree.
- *  @param seqs_begin Begin iterator of iterator pair input sequence.
- *  @param seqs_end End iterator of iterator pair input sequence.
- *  @param target Begin iterator out output sequence.
- *  @param comp Comparator.
- *  @param length Maximum length to merge.
- *  @param stable Stable merging incurs a performance penalty.
- *  @return End iterator of output sequence.
- *  @pre No input will run out of elements during the merge.
+ * @param seqs_begin Begin iterator of iterator pair input sequence.
+ * @param seqs_end End iterator of iterator pair input sequence.
+ * @param target Begin iterator of output sequence.
+ * @param comp Comparator.
+ * @param length Maximum length to merge.
+ *
+ * @return End iterator of output sequence.
  */
-template<typename LT,
-        typename RandomAccessIteratorIterator,
-        typename RandomAccessIterator3,
-        typename _DifferenceTp, typename Comparator>
+template<
+    typename UnguardedLoserTree,
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge_loser_tree_unguarded(RandomAccessIteratorIterator seqs_begin,
-                                      RandomAccessIteratorIterator seqs_end,
-                                      RandomAccessIterator3 target,
-                                      Comparator comp,
-                                      _DifferenceTp length, bool stable)
+  multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin,
+                                     RandomAccessIteratorIterator seqs_end,
+                                     RandomAccessIterator3 target,
+                                     Comparator comp,
+                                     _DifferenceTp length)
   {
     _GLIBCXX_CALL(length)
-    typedef _DifferenceTp difference_type;
 
+    typedef _DifferenceTp difference_type;
+    typedef std::iterator_traits<RandomAccessIteratorIterator> traits_type;
     typedef typename std::iterator_traits<RandomAccessIteratorIterator>
       ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
 
-    int k = seqs_end - seqs_begin;
-
-    LT lt(k, comp);
+    RandomAccessIterator3 target_end;
 
     difference_type total_length = 0;
-
-    for (int t = 0; t < k; ++t)
+    for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
       {
-#if _GLIBCXX_ASSERTIONS
-        _GLIBCXX_PARALLEL_ASSERT(seqs_begin[t].first != seqs_begin[t].second);
-#endif
-        if (stable)
-          lt.insert_start_stable(*seqs_begin[t].first, t, false);
-        else
-          lt.insert_start(*seqs_begin[t].first, t, false);
+        total_length += _GLIBCXX_PARALLEL_LENGTH(*s);
 
-        total_length += _GLIBCXX_PARALLEL_LENGTH(seqs_begin[t]);
+        // Move each sequence's end behind its sentinel spot.  This has the
+        // effect that the sentinel appears to be within the sequence. Then,
+        // we can use the unguarded variant if we merge out as many
+        // non-sentinel elements as we have.
+        ++((*s).second);
       }
 
-    if (stable)
-      lt.init_stable();
-    else
-      lt.init();
-
-    // Do not go past end.
-    length = std::min(total_length, length);
-
-    int source;
-
-#if _GLIBCXX_ASSERTIONS
-    difference_type i = 0;
-#endif
-
-    if (stable)
-      {
-        RandomAccessIterator3 target_end = target + length;
-        while (target < target_end)
-          {
-            // Take out.
-            source = lt.get_min_source();
-
-#if _GLIBCXX_ASSERTIONS
-            _GLIBCXX_PARALLEL_ASSERT(i == 0
-                || !comp(*(seqs_begin[source].first), *(target - 1)));
-#endif
-
-            *(target++) = *(seqs_begin[source].first++);
+    difference_type unguarded_length =
+         std::min(length, total_length);
+    target_end = multiway_merge_loser_tree_unguarded
+        <UnguardedLoserTree>
+      (seqs_begin, seqs_end, target, 0, comp, unguarded_length);
 
 #if _GLIBCXX_ASSERTIONS
-            _GLIBCXX_PARALLEL_ASSERT(
-                (seqs_begin[source].first != seqs_begin[source].second)
-                || (i == length - 1));
-            ++i;
+    _GLIBCXX_PARALLEL_ASSERT(target_end == target + length);
+    _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
 #endif
-            // Feed.
-            // Replace from same source.
-            lt.delete_min_insert_stable(*seqs_begin[source].first, false);
 
-          }
-      }
-    else
-      {
-        RandomAccessIterator3 target_end = target + length;
-        while (target < target_end)
-          {
-            // Take out.
-            source = lt.get_min_source();
+    // Restore the sequence ends so the sentinels are not contained in the
+    // sequence any more (see comment in loop above).
+    for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
+      { --((*s).second); }
 
-#if _GLIBCXX_ASSERTIONS
-            if (i > 0 && comp(*(seqs_begin[source].first), *(target - 1)))
-              printf("         %i %i %i\n", length, i, source);
-            _GLIBCXX_PARALLEL_ASSERT(i == 0
-                || !comp(*(seqs_begin[source].first), *(target - 1)));
-#endif
+    return target_end;
+  }
 
-            *(target++) = *(seqs_begin[source].first++);
+/**
+ * @brief Traits for determining whether the loser tree should
+ *   use pointers or copies.
+ *
+ * The field "use_pointer" is used to determine whether to use pointers in
+ * the loser trees or whether to copy the values into the loser tree.
+ *
+ * The default behavior is to use pointers if the data type is more than
+ * 4 times as big as a pointer to it.
+ *
+ * Specialize for your data type to customize the behavior.
+ *
+ * Example:
+ *
+ *   template<>
+ *   struct loser_tree_traits<int>
+ *   { static const bool use_pointer = false; };
+ *
+ *   template<>
+ *   struct loser_tree_traits<heavyweight_type>
+ *   { static const bool use_pointer = true; };
+ *
+ * @param T type to give the loser tree traits for.
+ */
+template <typename T>
+struct loser_tree_traits
+{
+  /**
+   * @brief True iff pointers are used instead of values in loser trees.
+   *
+   * The default behavior is to use pointers if the data type is more than
+   * four times as big as a pointer to it.
+   */
+  static const bool use_pointer = (sizeof(T) > 4 * sizeof(T*));
+};
 
-#if _GLIBCXX_ASSERTIONS
-            if (!((seqs_begin[source].first != seqs_begin[source].second)
-                || (i >= length - 1)))
-              printf("         %i %i %i\n", length, i, source);
-            _GLIBCXX_PARALLEL_ASSERT(
-                (seqs_begin[source].first != seqs_begin[source].second)
-                || (i >= length - 1));
-            ++i;
-#endif
-            // Feed.
-            // Replace from same source.
-            lt.delete_min_insert(*seqs_begin[source].first, false);
-          }
-      }
+/**
+ * @brief Switch for 3-way merging with sentinels turned off.
+ *
+ * Note that 3-way merging is always stable!
+ */
+template<
+  bool sentinels /*default == false*/,
+  typename RandomAccessIteratorIterator,
+  typename RandomAccessIterator3,
+  typename _DifferenceTp,
+  typename Comparator>
+struct multiway_merge_3_variant_sentinel_switch
+{
+  RandomAccessIterator3 operator()(
+      RandomAccessIteratorIterator seqs_begin,
+      RandomAccessIteratorIterator seqs_end,
+      RandomAccessIterator3 target,
+      Comparator comp, _DifferenceTp length)
+  {
+    return multiway_merge_3_variant<guarded_iterator>(
+        seqs_begin, seqs_end, target, comp, length);
+  }
+};
 
-    return target;
+/**
+ * @brief Switch for 3-way merging with sentinels turned on.
+ *
+ * Note that 3-way merging is always stable!
+ */
+template<
+  typename RandomAccessIteratorIterator,
+  typename RandomAccessIterator3,
+  typename _DifferenceTp,
+  typename Comparator>
+struct multiway_merge_3_variant_sentinel_switch
+    <true, RandomAccessIteratorIterator, RandomAccessIterator3,
+     _DifferenceTp, Comparator>
+{
+  RandomAccessIterator3 operator()(
+      RandomAccessIteratorIterator seqs_begin,
+      RandomAccessIteratorIterator seqs_end,
+      RandomAccessIterator3 target,
+      Comparator comp, _DifferenceTp length)
+  {
+    return multiway_merge_3_variant<unguarded_iterator>(
+        seqs_begin, seqs_end, target, comp, length);
   }
+};
 
-template<typename RandomAccessIteratorIterator,
-        typename RandomAccessIterator3,
-        typename _DifferenceTp,
-        typename Comparator>
-  RandomAccessIterator3
-  multiway_merge_loser_tree_combined(RandomAccessIteratorIterator seqs_begin,
-                                     RandomAccessIteratorIterator seqs_end,
-                                     RandomAccessIterator3 target,
-                                     Comparator comp,
-                                     _DifferenceTp length, bool stable)
+/**
+ * @brief Switch for 4-way merging with sentinels turned off.
+ *
+ * Note that 4-way merging is always stable!
+ */
+template<
+  bool sentinels /*default == false*/,
+  typename RandomAccessIteratorIterator,
+  typename RandomAccessIterator3,
+  typename _DifferenceTp,
+  typename Comparator>
+struct multiway_merge_4_variant_sentinel_switch
+{
+  RandomAccessIterator3 operator()(
+      RandomAccessIteratorIterator seqs_begin,
+      RandomAccessIteratorIterator seqs_end,
+      RandomAccessIterator3 target,
+      Comparator comp, _DifferenceTp length)
   {
-    _GLIBCXX_CALL(length)
+    return multiway_merge_4_variant<guarded_iterator>(
+        seqs_begin, seqs_end, target, comp, length);
+  }
+};
 
-    typedef _DifferenceTp difference_type;
+/**
+ * @brief Switch for 4-way merging with sentinels turned on.
+ *
+ * Note that 4-way merging is always stable!
+ */
+template<
+  typename RandomAccessIteratorIterator,
+  typename RandomAccessIterator3,
+  typename _DifferenceTp,
+  typename Comparator>
+struct multiway_merge_4_variant_sentinel_switch
+    <true, RandomAccessIteratorIterator, RandomAccessIterator3,
+     _DifferenceTp, Comparator>
+{
+  RandomAccessIterator3 operator()(
+      RandomAccessIteratorIterator seqs_begin,
+      RandomAccessIteratorIterator seqs_end,
+      RandomAccessIterator3 target,
+      Comparator comp, _DifferenceTp length)
+  {
+    return multiway_merge_4_variant<unguarded_iterator>(
+        seqs_begin, seqs_end, target, comp, length);
+  }
+};
 
+/**
+ * @brief Switch for k-way merging with sentinels turned on.
+ */
+template<
+  bool sentinels,
+  bool stable,
+  typename RandomAccessIteratorIterator,
+  typename RandomAccessIterator3,
+  typename _DifferenceTp,
+  typename Comparator>
+struct multiway_merge_k_variant_sentinel_switch
+{
+  RandomAccessIterator3 operator()(
+      RandomAccessIteratorIterator seqs_begin,
+      RandomAccessIteratorIterator seqs_end,
+      RandomAccessIterator3 target,
+      Comparator comp, _DifferenceTp length)
+  {
     typedef typename std::iterator_traits<RandomAccessIteratorIterator>
       ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
 
-    int min_seq;
-    RandomAccessIterator3 target_end;
-    difference_type overhang = prepare_unguarded(seqs_begin, seqs_end,
-                                          comp, min_seq, stable);
-
-    difference_type total_length = 0;
-    for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
-      total_length += _GLIBCXX_PARALLEL_LENGTH(*s);
-
-    if (overhang != -1)
-      {
-        difference_type unguarded_length =
-            std::min(length, total_length - overhang);
-        target_end = multiway_merge_loser_tree_unguarded
-          <typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
-          (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
-        overhang = length - unguarded_length;
-      }
-    else
-      {
-        // Empty sequence found.
-        overhang = length;
-        target_end = target;
-      }
-
-#if _GLIBCXX_ASSERTIONS
-    _GLIBCXX_PARALLEL_ASSERT(target_end == target + length - overhang);
-    _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
-#endif
-
-    target_end = multiway_merge_loser_tree
-      <typename loser_tree_traits<value_type, Comparator>::LT>
-      (seqs_begin, seqs_end, target_end, comp, overhang, stable);
-
-#if _GLIBCXX_ASSERTIONS
-    _GLIBCXX_PARALLEL_ASSERT(target_end == target + length);
-    _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
-#endif
-
-    return target_end;
+    return multiway_merge_loser_tree_sentinel<
+        typename __gnu_cxx::__conditional_type<
+            loser_tree_traits<value_type>::use_pointer
+          , LoserTreePointerUnguarded<stable, value_type, Comparator>
+          , LoserTreeUnguarded<stable, value_type, Comparator>
+        >::__type>(seqs_begin, seqs_end, target, comp, length);
   }
+};
 
-template<typename RandomAccessIteratorIterator,
-        typename RandomAccessIterator3,
-        typename _DifferenceTp,
-        typename Comparator>
-  RandomAccessIterator3
-  multiway_merge_loser_tree_sentinel(RandomAccessIteratorIterator seqs_begin,
-                                     RandomAccessIteratorIterator seqs_end,
-                                    RandomAccessIterator3 target,
-                                    Comparator comp,
-                                    _DifferenceTp length, bool stable)
+/**
+ * @brief Switch for k-way merging with sentinels turned off.
+ */
+template<
+  bool stable,
+  typename RandomAccessIteratorIterator,
+  typename RandomAccessIterator3,
+  typename _DifferenceTp,
+  typename Comparator>
+struct multiway_merge_k_variant_sentinel_switch
+    <false, stable, RandomAccessIteratorIterator, RandomAccessIterator3,
+     _DifferenceTp, Comparator>
+{
+  RandomAccessIterator3 operator()(
+      RandomAccessIteratorIterator seqs_begin,
+      RandomAccessIteratorIterator seqs_end,
+      RandomAccessIterator3 target,
+      Comparator comp, _DifferenceTp length)
   {
-    _GLIBCXX_CALL(length)
-
-    typedef _DifferenceTp difference_type;
-    typedef std::iterator_traits<RandomAccessIteratorIterator> traits_type;
     typedef typename std::iterator_traits<RandomAccessIteratorIterator>
       ::value_type::first_type
       RandomAccessIterator1;
     typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
       value_type;
 
-    RandomAccessIterator3 target_end;
-    difference_type overhang =
-        prepare_unguarded_sentinel(seqs_begin, seqs_end, comp);
-
-    difference_type total_length = 0;
-    for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
-      {
-        total_length += _GLIBCXX_PARALLEL_LENGTH(*s);
-
-        // Sentinel spot.
-        ++((*s).second);
-      }
-
-    difference_type unguarded_length =
-        std::min(length, total_length - overhang);
-    target_end = multiway_merge_loser_tree_unguarded
-      <typename loser_tree_unguarded_traits<value_type, Comparator>::LT>
-      (seqs_begin, seqs_end, target, comp, unguarded_length, stable);
-    overhang = length - unguarded_length;
-
-#if _GLIBCXX_ASSERTIONS
-    _GLIBCXX_PARALLEL_ASSERT(target_end == target + length - overhang);
-    _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
-#endif
-
-    // Copy rest stable.
-    for (RandomAccessIteratorIterator s = seqs_begin;
-         s != seqs_end && overhang > 0; ++s)
-      {
-        // Restore.
-        --((*s).second);
-        difference_type local_length =
-            std::min<difference_type>(overhang, _GLIBCXX_PARALLEL_LENGTH(*s));
-        target_end = std::copy((*s).first, (*s).first + local_length,
-                               target_end);
-        (*s).first += local_length;
-        overhang -= local_length;
-      }
-
-#if _GLIBCXX_ASSERTIONS
-    _GLIBCXX_PARALLEL_ASSERT(overhang == 0);
-    _GLIBCXX_PARALLEL_ASSERT(target_end == target + length);
-    _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target_end, comp));
-#endif
-
-    return target_end;
+    return multiway_merge_loser_tree<
+        typename __gnu_cxx::__conditional_type<
+            loser_tree_traits<value_type>::use_pointer
+          , LoserTreePointer<stable, value_type, Comparator>
+          , LoserTree<stable, value_type, Comparator>
+        >::__type >(seqs_begin, seqs_end, target, comp, length);
   }
+};
 
 /** @brief Sequential multi-way merging switch.
  *
- *  The _GLIBCXX_PARALLEL_DECISION if based on the branching factor and
+ *  The _GLIBCXX_PARALLEL_DECISION is based on the branching factor and
  *  runtime settings.
  *  @param seqs_begin Begin iterator of iterator pair input sequence.
  *  @param seqs_end End iterator of iterator pair input sequence.
@@ -1330,17 +981,18 @@ template<typename RandomAccessIteratorIterator,
  *  @param stable Stable merging incurs a performance penalty.
  *  @param sentinel The sequences have a sentinel element.
  *  @return End iterator of output sequence. */
-template<typename RandomAccessIteratorIterator,
-        typename RandomAccessIterator3,
-        typename _DifferenceTp,
-        typename Comparator>
+template<
+    bool stable,
+    bool sentinels,
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Comparator>
   RandomAccessIterator3
-  multiway_merge(RandomAccessIteratorIterator seqs_begin,
+  sequential_multiway_merge(RandomAccessIteratorIterator seqs_begin,
                  RandomAccessIteratorIterator seqs_end,
                  RandomAccessIterator3 target,
-                 Comparator comp, _DifferenceTp length,
-                 bool stable, bool sentinel,
-                 sequential_tag)
+                 Comparator comp, _DifferenceTp length)
   {
     _GLIBCXX_CALL(length)
 
@@ -1353,17 +1005,14 @@ template<typename RandomAccessIteratorIterator,
 
 #if _GLIBCXX_ASSERTIONS
     for (RandomAccessIteratorIterator s = seqs_begin; s != seqs_end; ++s)
-      _GLIBCXX_PARALLEL_ASSERT(is_sorted((*s).first, (*s).second, comp));
+      {
+        _GLIBCXX_PARALLEL_ASSERT(is_sorted((*s).first, (*s).second, comp));
+      }
 #endif
 
-    RandomAccessIterator3 return_target = target;
+      RandomAccessIterator3 return_target = target;
     int k = static_cast<int>(seqs_end - seqs_begin);
 
-    _MultiwayMergeAlgorithm mwma = _Settings::get().multiway_merge_algorithm;
-
-    if (!sentinel && mwma == LOSER_TREE_SENTINEL)
-      mwma = LOSER_TREE_COMBINED;
-
     switch (k)
       {
       case 0:
@@ -1382,113 +1031,30 @@ template<typename RandomAccessIteratorIterator,
                                       target, length, comp);
         break;
       case 3:
-        switch (mwma)
-          {
-          case LOSER_TREE_COMBINED:
-            return_target = multiway_merge_3_combined(seqs_begin,
-                                                     seqs_end,
-                                                     target,
-                                                     comp, length,
-                                                     stable);
-            break;
-          case LOSER_TREE_SENTINEL:
-            return_target =
-             multiway_merge_3_variant<unguarded_iterator>(seqs_begin,
-                                                          seqs_end,
-                                                          target,
-                                                          comp, length,
-                                                          stable);
-            break;
-          default:
-            return_target = 
-             multiway_merge_3_variant<guarded_iterator>(seqs_begin,
-                                                        seqs_end,
-                                                        target,
-                                                        comp, length,
-                                                        stable);
-            break;
-          }
+        return_target = multiway_merge_3_variant_sentinel_switch<
+            sentinels
+          , RandomAccessIteratorIterator
+          , RandomAccessIterator3
+          , _DifferenceTp
+          , Comparator>()(seqs_begin, seqs_end, target, comp, length);
         break;
       case 4:
-        switch (mwma)
-          {
-          case LOSER_TREE_COMBINED:
-            return_target = multiway_merge_4_combined(seqs_begin,
-                                                     seqs_end,
-                                                     target,
-                                                     comp, length, stable);
-            break;
-          case LOSER_TREE_SENTINEL:
-            return_target = 
-             multiway_merge_4_variant<unguarded_iterator>(seqs_begin,
-                                                          seqs_end,
-                                                          target,
-                                                          comp, length,
-                                                          stable);
-            break;
-          default:
-            return_target = multiway_merge_4_variant<guarded_iterator>(
-             seqs_begin,
-             seqs_end,
-             target,
-             comp, length, stable);
-            break;
-          }
+        return_target = multiway_merge_4_variant_sentinel_switch<
+            sentinels
+          , RandomAccessIteratorIterator
+          , RandomAccessIterator3
+          , _DifferenceTp
+          , Comparator>()(seqs_begin, seqs_end, target, comp, length);
         break;
       default:
-        {
-          switch (mwma)
-            {
-            case BUBBLE:
-              return_target = multiway_merge_bubble(seqs_begin,
-                                                   seqs_end,
-                                                   target,
-                                                   comp, length, stable);
-              break;
-#if _GLIBCXX_LOSER_TREE_EXPLICIT
-            case LOSER_TREE_EXPLICIT:
-              return_target = multiway_merge_loser_tree<
-             LoserTreeExplicit<value_type, Comparator> >(seqs_begin,
-                                                         seqs_end,
-                                                         target,
-                                                         comp, length,
-                                                         stable);
-              break;
-#endif
-#if _GLIBCXX_LOSER_TREE
-            case LOSER_TREE:
-              return_target = multiway_merge_loser_tree<
-                    LoserTree<value_type, Comparator> >(seqs_begin,
-                                                       seqs_end,
-                                                       target,
-                                                       comp, length,
-                                                       stable);
-              break;
-#endif
-#if _GLIBCXX_LOSER_TREE_COMBINED
-            case LOSER_TREE_COMBINED:
-              return_target = multiway_merge_loser_tree_combined(seqs_begin,
-                                                                seqs_end,
-                                                                target,
-                                                                comp, length,
-                                                                stable);
-              break;
-#endif
-#if _GLIBCXX_LOSER_TREE_SENTINEL
-            case LOSER_TREE_SENTINEL:
-              return_target = multiway_merge_loser_tree_sentinel(seqs_begin,
-                                                                seqs_end,
-                                                                target,
-                                                                comp, length,
-                                                                stable);
-              break;
-#endif
-            default:
-              // multiway_merge algorithm not implemented.
-              _GLIBCXX_PARALLEL_ASSERT(0);
-              break;
-            }
-        }
+          return_target = multiway_merge_k_variant_sentinel_switch<
+              sentinels
+            , stable
+            , RandomAccessIteratorIterator
+            , RandomAccessIterator3
+            , _DifferenceTp
+            , Comparator>()(seqs_begin, seqs_end, target, comp, length);
+          break;
       }
 #if _GLIBCXX_ASSERTIONS
     _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
@@ -1497,38 +1063,246 @@ template<typename RandomAccessIteratorIterator,
     return return_target;
   }
 
+/**
+ * @brief Stable sorting functor.
+ *
+ * Used to reduce code instantiation in multiway_merge_sampling_splitting.
+ */
+template<bool stable, class RandomAccessIterator, class StrictWeakOrdering>
+struct sampling_sorter
+{
+  void operator()(RandomAccessIterator first, RandomAccessIterator last,
+                  StrictWeakOrdering comp)
+  { __gnu_sequential::stable_sort(first, last, comp); }
+};
+
+/**
+ * @brief Non-stable sorting functor.
+ *
+ * Used to reduce code instantiation in multiway_merge_sampling_splitting.
+ */
+template<class RandomAccessIterator, class StrictWeakOrdering>
+struct sampling_sorter<false, RandomAccessIterator, StrictWeakOrdering>
+{
+  void operator()(RandomAccessIterator first, RandomAccessIterator last,
+                  StrictWeakOrdering comp)
+  { __gnu_sequential::sort(first, last, comp); }
+};
+
+/**
+ * @brief Sampling based splitting for parallel multiway-merge routine.
+ */
+template<
+    bool stable
+  , typename RandomAccessIteratorIterator
+  , typename Comparator
+  , typename difference_type>
+void multiway_merge_sampling_splitting(
+    RandomAccessIteratorIterator seqs_begin,
+    RandomAccessIteratorIterator seqs_end,
+    Comparator comp, difference_type length,
+    difference_type total_length,
+    std::vector<std::pair<difference_type, difference_type> > *pieces)
+{
+  typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+    ::value_type::first_type
+    RandomAccessIterator1;
+  typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
+    value_type;
+
+  // k sequences.
+  int k = static_cast<int>(seqs_end - seqs_begin);
+
+  int num_threads = omp_get_num_threads();
+
+  difference_type num_samples =
+      __gnu_parallel::_Settings::get().merge_oversampling * num_threads;
+
+  value_type* samples = static_cast<value_type*>(
+    ::operator new(sizeof(value_type) * k * num_samples));
+  // Sample.
+  for (int s = 0; s < k; ++s)
+    for (difference_type i = 0; i < num_samples; ++i)
+      {
+        difference_type sample_index =
+            static_cast<difference_type>(
+                _GLIBCXX_PARALLEL_LENGTH(seqs_begin[s]) * (double(i + 1) /
+                (num_samples + 1)) * (double(length)
+                / total_length));
+        new(&(samples[s * num_samples + i])) value_type(
+            seqs_begin[s].first[sample_index]);
+      }
+
+  // Sort stable or non-stable, depending on value of template parameter
+  // "stable".
+  sampling_sorter<stable, value_type*, Comparator>()(
+      samples, samples + (num_samples * k), comp);
+
+  for (int slab = 0; slab < num_threads; ++slab)
+    // For each slab / processor.
+    for (int seq = 0; seq < k; ++seq)
+      {
+        // For each sequence.
+        if (slab > 0)
+          pieces[slab][seq].first =
+              std::upper_bound(
+                seqs_begin[seq].first,
+                seqs_begin[seq].second,
+                samples[num_samples * k * slab / num_threads],
+                  comp)
+              - seqs_begin[seq].first;
+        else
+          {
+            // Absolute beginning.
+            pieces[slab][seq].first = 0;
+          }
+        if ((slab + 1) < num_threads)
+          pieces[slab][seq].second =
+              std::upper_bound(
+                  seqs_begin[seq].first,
+                  seqs_begin[seq].second,
+                  samples[num_samples * k * (slab + 1) /
+                      num_threads], comp)
+              - seqs_begin[seq].first;
+        else
+        pieces[slab][seq].second = _GLIBCXX_PARALLEL_LENGTH(seqs_begin[seq]);
+      }
+    ::operator delete(samples);
+}
+
+/**
+ * @brief Exact splitting for parallel multiway-merge routine.
+ */
+template<
+    bool stable
+  , typename RandomAccessIteratorIterator
+  , typename Comparator
+  , typename difference_type>
+void multiway_merge_exact_splitting(
+    RandomAccessIteratorIterator seqs_begin,
+    RandomAccessIteratorIterator seqs_end,
+    Comparator comp,
+    difference_type length,
+    difference_type total_length,
+    std::vector<std::pair<difference_type, difference_type> > *pieces)
+{
+  typedef typename std::iterator_traits<RandomAccessIteratorIterator>
+    ::value_type::first_type
+    RandomAccessIterator1;
+
+  const bool tight = (total_length == length);
+
+  // k sequences.
+  const int k = static_cast<int>(seqs_end - seqs_begin);
+
+  const int num_threads = omp_get_num_threads();
+
+  // (Settings::multiway_merge_splitting == __gnu_parallel::_Settings::EXACT).
+  std::vector<RandomAccessIterator1>* offsets =
+      new std::vector<RandomAccessIterator1>[num_threads];
+  std::vector<
+      std::pair<RandomAccessIterator1, RandomAccessIterator1>
+      > se(k);
+
+  copy(seqs_begin, seqs_end, se.begin());
+
+  difference_type* borders =
+      new difference_type[num_threads + 1];
+  equally_split(length, num_threads, borders);
+
+  for (int s = 0; s < (num_threads - 1); ++s)
+    {
+      offsets[s].resize(k);
+      multiseq_partition(
+          se.begin(), se.end(), borders[s + 1],
+          offsets[s].begin(), comp);
+
+      // Last one also needed and available.
+      if (!tight)
+        {
+          offsets[num_threads - 1].resize(k);
+          multiseq_partition(se.begin(), se.end(),
+                difference_type(length),
+                offsets[num_threads - 1].begin(),  comp);
+        }
+    }
+
+
+  for (int slab = 0; slab < num_threads; ++slab)
+    {
+      // For each slab / processor.
+      for (int seq = 0; seq < k; ++seq)
+        {
+          // For each sequence.
+          if (slab == 0)
+            {
+              // Absolute beginning.
+              pieces[slab][seq].first = 0;
+            }
+          else
+            pieces[slab][seq].first =
+                pieces[slab - 1][seq].second;
+          if (!tight || slab < (num_threads - 1))
+            pieces[slab][seq].second =
+                offsets[slab][seq] - seqs_begin[seq].first;
+          else
+            {
+              // slab == num_threads - 1
+              pieces[slab][seq].second =
+                  _GLIBCXX_PARALLEL_LENGTH(seqs_begin[seq]);
+            }
+        }
+    }
+  delete[] offsets;
+}
+
 /** @brief Parallel multi-way merge routine.
  *
- *  The _GLIBCXX_PARALLEL_DECISION if based on the branching factor
- *  and runtime settings.
- *  @param seqs_begin Begin iterator of iterator pair input sequence.
- *  @param seqs_end End iterator of iterator pair input sequence.
- *  @param target Begin iterator out output sequence.
- *  @param comp Comparator.
- *  @param length Maximum length to merge.
- *  @param stable Stable merging incurs a performance penalty.
- *  @param sentinel Ignored.
- *  @return End iterator of output sequence.
+ * The _GLIBCXX_PARALLEL_DECISION is based on the branching factor
+ * and runtime settings.
+ *
+ * Must not be called if the number of sequences is 1.
+ *
+ * @param Splitter functor to split input (either exact or sampling based)
+ *
+ * @param seqs_begin Begin iterator of iterator pair input sequence.
+ * @param seqs_end End iterator of iterator pair input sequence.
+ * @param target Begin iterator of output sequence.
+ * @param comp Comparator.
+ * @param length Maximum length to merge.
+ * @param stable Stable merging incurs a performance penalty.
+ * @param sentinel Ignored.
+ * @return End iterator of output sequence.
  */
-template<typename RandomAccessIteratorIterator,
-        typename RandomAccessIterator3,
-        typename _DifferenceTp,
-        typename Comparator>
+template<
+    bool stable,
+    bool sentinels,
+    typename RandomAccessIteratorIterator,
+    typename RandomAccessIterator3,
+    typename _DifferenceTp,
+    typename Splitter,
+    typename Comparator
+    >
   RandomAccessIterator3
   parallel_multiway_merge(RandomAccessIteratorIterator seqs_begin,
                           RandomAccessIteratorIterator seqs_end,
-                           RandomAccessIterator3 target,
-                           Comparator comp,
-                           _DifferenceTp length, bool stable, bool sentinel)
+                          RandomAccessIterator3 target,
+                          Comparator comp,
+                          Splitter splitter,
+                          _DifferenceTp length)
     {
+#if _GLIBCXX_ASSERTIONS
+      _GLIBCXX_PARALLEL_ASSERT(seqs_end - seqs_begin > 1);
+#endif
+
       _GLIBCXX_CALL(length)
 
       typedef _DifferenceTp difference_type;
       typedef typename std::iterator_traits<RandomAccessIteratorIterator>
         ::value_type::first_type
         RandomAccessIterator1;
-      typedef typename std::iterator_traits<RandomAccessIterator1>::value_type
-        value_type;
+      typedef typename
+        std::iterator_traits<RandomAccessIterator1>::value_type value_type;
 
       // k sequences.
       int k = static_cast<int>(seqs_end - seqs_begin);
@@ -1543,13 +1317,10 @@ template<typename RandomAccessIteratorIterator,
       if (total_length == 0 || k == 0)
         return target;
 
-      bool tight = (total_length == length);
-
       std::vector<std::pair<difference_type, difference_type> >* pieces;
 
       thread_index_t num_threads = static_cast<thread_index_t>(
-       std::min<difference_type>(get_max_threads(), total_length));
-      const _Settings& __s = _Settings::get();
+       std::min<difference_type>(get_max_threads(), total_length));
 
 #     pragma omp parallel num_threads (num_threads)
         {
@@ -1562,126 +1333,12 @@ template<typename RandomAccessIteratorIterator,
               for (int s = 0; s < num_threads; ++s)
                 pieces[s].resize(k);
 
-              difference_type num_samples = __s.merge_oversampling 
-                                           * num_threads;
+              difference_type num_samples =
+                  __gnu_parallel::_Settings::get().merge_oversampling *
+                    num_threads;
 
-              if (__s.multiway_merge_splitting == SAMPLING)
-                {
-                  value_type* samples = static_cast<value_type*>(
-                    ::operator new(sizeof(value_type) * k * num_samples));
-                  // Sample.
-                  for (int s = 0; s < k; ++s)
-                    for (difference_type i = 0; i < num_samples; ++i)
-                      {
-                        difference_type sample_index =
-                         static_cast<difference_type>(
-                           _GLIBCXX_PARALLEL_LENGTH(seqs_begin[s])
-                           * (double(i + 1) / (num_samples + 1))
-                           * (double(length) / total_length));
-                        ::new(&(samples[s * num_samples + i]))
-                           value_type(seqs_begin[s].first[sample_index]);
-                      }
-
-                  if (stable)
-                    __gnu_sequential::stable_sort(samples, samples
-                                                 + (num_samples * k), comp);
-                  else
-                    __gnu_sequential::sort(samples, samples
-                                          + (num_samples * k), comp);
-
-                  for (int slab = 0; slab < num_threads; ++slab)
-                    // For each slab / processor.
-                    for (int seq = 0; seq < k; ++seq)
-                      {
-                        // For each sequence.
-                        if (slab > 0)
-                          pieces[slab][seq].first =
-                              std::upper_bound(seqs_begin[seq].first,
-                                              seqs_begin[seq].second,
-                                              samples[num_samples * k
-                                                      * slab / num_threads],
-                                              comp)
-                           - seqs_begin[seq].first;
-                        else
-                          {
-                            // Absolute beginning.
-                            pieces[slab][seq].first = 0;
-                          }
-                        if ((slab + 1) < num_threads)
-                          pieces[slab][seq].second =
-                           std::upper_bound(seqs_begin[seq].first,
-                                            seqs_begin[seq].second,
-                                            samples[num_samples * k
-                                                    * (slab + 1)
-                                                    / num_threads], comp)
-                           - seqs_begin[seq].first;
-                        else
-                         pieces[slab][seq].second 
-                           = _GLIBCXX_PARALLEL_LENGTH(seqs_begin[seq]);
-                      }
-                 ::operator delete(samples);
-                }
-              else
-                {
-                  // (_Settings::multiway_merge_splitting == _Settings::EXACT).
-                  std::vector<RandomAccessIterator1>* offsets =
-                      new std::vector<RandomAccessIterator1>[num_threads];
-                  std::vector<
-                      std::pair<RandomAccessIterator1, RandomAccessIterator1>
-                      > se(k);
-
-                  copy(seqs_begin, seqs_end, se.begin());
-
-                  difference_type* borders =
-                      new difference_type[num_threads + 1];
-                  equally_split(length, num_threads, borders);
-
-                  for (int s = 0; s < (num_threads - 1); ++s)
-                    {
-                      offsets[s].resize(k);
-                      multiseq_partition(
-                          se.begin(), se.end(), borders[s + 1],
-                          offsets[s].begin(), comp);
-
-                      // Last one also needed and available.
-                      if (!tight)
-                        {
-                          offsets[num_threads - 1].resize(k);
-                          multiseq_partition(se.begin(), se.end(),
-                                            difference_type(length),
-                                            offsets[num_threads - 1].begin(),
-                                            comp);
-                        }
-                    }
-
-
-                  for (int slab = 0; slab < num_threads; ++slab)
-                    {
-                      // For each slab / processor.
-                      for (int seq = 0; seq < k; ++seq)
-                        {
-                          // For each sequence.
-                          if (slab == 0)
-                            {
-                              // Absolute beginning.
-                              pieces[slab][seq].first = 0;
-                            }
-                          else
-                            pieces[slab][seq].first =
-                                pieces[slab - 1][seq].second;
-                          if (!tight || slab < (num_threads - 1))
-                            pieces[slab][seq].second =
-                             offsets[slab][seq] - seqs_begin[seq].first;
-                          else
-                            {
-                              // slab == num_threads - 1
-                              pieces[slab][seq].second =
-                               _GLIBCXX_PARALLEL_LENGTH(seqs_begin[seq]);
-                            }
-                        }
-                    }
-                  delete[] offsets;
-                }
+              splitter(seqs_begin, seqs_end, comp, length, total_length,
+                       pieces);
             } //single
 
           thread_index_t iam = omp_get_thread_num();
@@ -1701,15 +1358,14 @@ template<typename RandomAccessIteratorIterator,
               for (int s = 0; s < k; ++s)
                 {
                   chunks[s] = std::make_pair(
-                   seqs_begin[s].first + pieces[iam][s].first,
-                   seqs_begin[s].first + pieces[iam][s].second);
+                  seqs_begin[s].first + pieces[iam][s].first,
+                  seqs_begin[s].first + pieces[iam][s].second);
                   local_length += _GLIBCXX_PARALLEL_LENGTH(chunks[s]);
                 }
 
-              multiway_merge(
+              sequential_multiway_merge<stable, sentinels>(
                     chunks, chunks + k, target + target_position, comp,
-                    std::min(local_length, length - target_position),
-                    stable, false, sequential_tag());
+                    std::min(local_length, length - target_position));
 
               delete[] chunks;
             }
@@ -1727,7 +1383,7 @@ template<typename RandomAccessIteratorIterator,
                            (pieces[iam][1].second - pieces[iam][1].first),
                            comp);
             }
-        } //parallel
+        } // parallel
 
 #if _GLIBCXX_ASSERTIONS
       _GLIBCXX_PARALLEL_ASSERT(is_sorted(target, target + length, comp));
@@ -1743,88 +1399,605 @@ template<typename RandomAccessIteratorIterator,
     }
 
 /**
- *  @brief Multi-way merging front-end.
- *  @param seqs_begin Begin iterator of iterator pair input sequence.
- *  @param seqs_end End iterator of iterator pair input sequence.
- *  @param target Begin iterator out output sequence.
- *  @param comp Comparator.
- *  @param length Maximum length to merge.
- *  @param stable Stable merging incurs a performance penalty.
- *  @return End iterator of output sequence.
+ * @brief Multiway Merge Frontend.
+ *
+ * Merge the sequences specified by seqs_begin and seqs_end into
+ * target.  seqs_begin and seqs_end must point to a sequence of
+ * pairs.  These pairs must contain an iterator to the beginning
+ * of a sequence in their first entry and an iterator to the end of
+ * the same sequence in their second entry.
+ *
+ * Ties are broken arbitrarily.  See stable_multiway_merge for a variant
+ * that breaks ties by sequence number but is slower.
+ *
+ * The first entries of the pairs (i.e. the begin iterators) will be moved
+ * forward.
+ *
+ * The output sequence has to provide enough space for all elements
+ * that are written to it.
+ *
+ * This function will merge the input sequences:
+ *
+ * - not stable
+ * - parallel, depending on the input size and Settings
+ * - using sampling for splitting
+ * - not using sentinels
+ *
+ * Example:
+ *
+ * <pre>
+ *   int sequences[10][10];
+ *   for (int i = 0; i < 10; ++i)
+ *     for (int j = 0; j < 10; ++j)
+ *       sequences[i][j] = j;
+ *   
+ *   int out[33];
+ *   std::vector<std::pair<int*, int*> > seqs;
+ *   for (int i = 0; i < 10; ++i)
+ *     { seqs.push_back(std::make_pair(sequences[i], sequences[i] + 10)); }
+ *   
+ *   multiway_merge(seqs.begin(), seqs.end(), out, std::less<int>(), 33);
+ * </pre>
+ *
+ * @see stable_multiway_merge
+ *
+ * @pre All input sequences must be sorted.
+ * @pre Target must provide enough space for @c length elements or
+ *    the number of elements in all sequences, whichever is smaller.
+ *
+ * @post [target, return value) contains merged elements from the
+ *    input sequences.
+ * @post return value - target = min(length, number of elements in all
+ *    sequences).
+ *
+ * @param RandomAccessIteratorPairIterator iterator over sequence
+ *    of pairs of iterators
+ * @param RandomAccessIteratorOut iterator over target sequence
+ * @param _DifferenceTp difference type for the sequence
+ * @param Comparator strict weak ordering type to compare elements
+ *    in sequences
+ *
+ * @param seqs_begin  begin of the sequence of iterator pairs
+ * @param seqs_end    end of the sequence of iterator pairs
+ * @param target      target sequence to merge to.
+ * @param comp        strict weak ordering to use for element comparison.
+ * @param length      the number of elements to merge into target.
+ *
+ * @return end iterator of output sequence
  */
-template<typename RandomAccessIteratorPairIterator,
-        typename RandomAccessIterator3,
-        typename _DifferenceTp,
-        typename Comparator>
-  RandomAccessIterator3
-  multiway_merge(RandomAccessIteratorPairIterator seqs_begin,
-                RandomAccessIteratorPairIterator seqs_end,
-                RandomAccessIterator3 target, Comparator comp,
-                _DifferenceTp length, bool stable)
-  {
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+multiway_merge(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length)
+{
+  typedef _DifferenceTp difference_type;
+  _GLIBCXX_CALL(seqs_end - seqs_begin)
+
+  // catch special case: no sequences
+  if (seqs_begin == seqs_end)
+    return target;
+
+  // Execute merge; maybe parallel, depending on the number of merged
+  // elements and the number of sequences and global thresholds in
+  // Settings.
+  RandomAccessIteratorOut target_end;
+  if ((seqs_end - seqs_begin > 1) &&
+        _GLIBCXX_PARALLEL_CONDITION(
+        ((seqs_end - seqs_begin) >=
+        __gnu_parallel::_Settings::get().multiway_merge_minimal_k)
+        && ((sequence_index_t)length >=
+        __gnu_parallel::_Settings::get().multiway_merge_minimal_n)))
+    target_end = parallel_multiway_merge
+      </* stable = */ false, /* sentinels = */ false>
+        (seqs_begin, seqs_end, target, comp,
+        multiway_merge_sampling_splitting</* stable = */ false,
+          RandomAccessIteratorPairIterator, Comparator, _DifferenceTp>,
+        static_cast<difference_type>(length));
+  else
+    target_end = sequential_multiway_merge
+      </* stable = */false, /* sentinels = */ false>(
+        seqs_begin, seqs_end,
+        target, comp, length);
+
+  return target_end;
+}
+
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+multiway_merge(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length
+    , __gnu_parallel::sequential_tag)
+{
+  typedef _DifferenceTp difference_type;
+  _GLIBCXX_CALL(seqs_end - seqs_begin)
+
+  // catch special case: no sequences
+  if (seqs_begin == seqs_end)
+    return target;
+
+  // Execute multiway merge *sequentially*.
+  return sequential_multiway_merge
+    </* stable = */ false, /* sentinels = */ false>
+      (seqs_begin, seqs_end, target, comp, length);
+}
+
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+multiway_merge(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length
+    , __gnu_parallel::exact_tag)
+{
     typedef _DifferenceTp difference_type;
     _GLIBCXX_CALL(seqs_end - seqs_begin)
 
+    // catch special case: no sequences
     if (seqs_begin == seqs_end)
       return target;
 
-    const _Settings& __s = _Settings::get();
-
-    RandomAccessIterator3 target_end;
-    if (_GLIBCXX_PARALLEL_CONDITION(
-        ((seqs_end - seqs_begin) >= __s.multiway_merge_minimal_k)
-        && ((sequence_index_t)length >= __s.multiway_merge_minimal_n)))
-      target_end = parallel_multiway_merge(seqs_begin, seqs_end,
-                                          target, comp,
-                                         static_cast<difference_type>(length),
-                                          stable, false);
+    // Execute merge; maybe parallel, depending on the number of merged
+    // elements and the number of sequences and global thresholds in
+    // Settings.
+    RandomAccessIteratorOut target_end;
+    if ((seqs_end - seqs_begin > 1) &&
+          _GLIBCXX_PARALLEL_CONDITION(
+          ((seqs_end - seqs_begin) >=
+             __gnu_parallel::_Settings::get().multiway_merge_minimal_k)
+          && ((sequence_index_t)length >=
+            __gnu_parallel::_Settings::get().multiway_merge_minimal_n)))
+      target_end = parallel_multiway_merge
+                    </* stable = */ false, /* sentinels = */ false>(
+          seqs_begin, seqs_end,
+          target, comp,
+          multiway_merge_exact_splitting</* stable = */ false,
+            RandomAccessIteratorPairIterator, Comparator, _DifferenceTp>,
+          static_cast<difference_type>(length));
     else
-      target_end = multiway_merge(seqs_begin, seqs_end, target, comp, length,
-                                 stable, false, sequential_tag());
+      target_end = sequential_multiway_merge
+                      </* stable = */ false, /* sentinels = */ false>(
+          seqs_begin, seqs_end,
+          target, comp, length);
 
     return target_end;
-  }
+}
 
-/** @brief Multi-way merging front-end.
- *  @param seqs_begin Begin iterator of iterator pair input sequence.
- *  @param seqs_end End iterator of iterator pair input sequence.
- *  @param target Begin iterator out output sequence.
- *  @param comp Comparator.
- *  @param length Maximum length to merge.
- *  @param stable Stable merging incurs a performance penalty.
- *  @return End iterator of output sequence.
- *  @pre For each @c i, @c seqs_begin[i].second must be the end
- *  marker of the sequence, but also reference the one more sentinel
- *  element. */
-template<typename RandomAccessIteratorPairIterator,
-        typename RandomAccessIterator3,
-        typename _DifferenceTp,
-        typename Comparator>
-  RandomAccessIterator3
-  multiway_merge_sentinel(RandomAccessIteratorPairIterator seqs_begin,
-                          RandomAccessIteratorPairIterator seqs_end,
-                          RandomAccessIterator3 target,
-                          Comparator comp,
-                          _DifferenceTp length,
-                          bool stable)
-  {
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+stable_multiway_merge(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length)
+{
     typedef _DifferenceTp difference_type;
+    _GLIBCXX_CALL(seqs_end - seqs_begin)
 
+    // catch special case: no sequences
     if (seqs_begin == seqs_end)
       return target;
 
+    // Execute merge; maybe parallel, depending on the number of merged
+    // elements and the number of sequences and global thresholds in
+    // Settings.
+    RandomAccessIteratorOut target_end;
+    if ((seqs_end - seqs_begin > 1) &&
+          _GLIBCXX_PARALLEL_CONDITION(
+          ((seqs_end - seqs_begin) >=
+            __gnu_parallel::_Settings::get().multiway_merge_minimal_k)
+          && ((sequence_index_t)length >=
+            __gnu_parallel::_Settings::get().multiway_merge_minimal_n)))
+      target_end = parallel_multiway_merge
+        </* stable = */ true, /* sentinels = */ false>(
+          seqs_begin, seqs_end,
+          target, comp,
+          multiway_merge_sampling_splitting</* stable = */ true,
+          RandomAccessIteratorPairIterator, Comparator, _DifferenceTp>,
+          static_cast<difference_type>(length));
+    else
+      target_end = sequential_multiway_merge
+        </* stable = */ true, /* sentinels = */ false>(
+          seqs_begin, seqs_end,
+          target, comp, length);
+
+    return target_end;
+}
+
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+stable_multiway_merge(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length
+    , __gnu_parallel::sequential_tag)
+{
+    typedef _DifferenceTp difference_type;
+    _GLIBCXX_CALL(seqs_end - seqs_begin)
+
+    // catch special case: no sequences
+    if (seqs_begin == seqs_end)
+      { return target; }
+
+    // Execute multiway merge *sequentially*.
+    return sequential_multiway_merge
+      </* stable = */ true, /* sentinels = */ false>
+        (seqs_begin, seqs_end, target, comp, length);
+}
+
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+stable_multiway_merge(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length
+    , __gnu_parallel::exact_tag)
+{
+    typedef _DifferenceTp difference_type;
     _GLIBCXX_CALL(seqs_end - seqs_begin)
 
-    const _Settings& __s = _Settings::get();
-    const bool cond1 = seqs_end - seqs_begin >= __s.multiway_merge_minimal_k;
-    const bool cond2 = sequence_index_t(length) >= __s.multiway_merge_minimal_n;
-    if (_GLIBCXX_PARALLEL_CONDITION(cond1 && cond2))
-      return parallel_multiway_merge(seqs_begin, seqs_end, target, comp, 
-                                    length, stable, true);
+    // catch special case: no sequences
+    if (seqs_begin == seqs_end)
+      { return target; }
+
+    // Execute merge; maybe parallel, depending on the number of merged
+    // elements and the number of sequences and global thresholds in
+    // Settings.
+    RandomAccessIteratorOut target_end;
+    if ((seqs_end - seqs_begin > 1) &&
+          _GLIBCXX_PARALLEL_CONDITION(
+          ((seqs_end - seqs_begin) >=
+            __gnu_parallel::_Settings::get().multiway_merge_minimal_k)
+          && ((sequence_index_t)length >=
+            __gnu_parallel::_Settings::get().multiway_merge_minimal_n)))
+      target_end = parallel_multiway_merge
+        </* stable = */ true, /* sentinels = */ false>(
+          seqs_begin, seqs_end,
+          target, comp, 
+          multiway_merge_exact_splitting
+            </* stable = */ true, RandomAccessIteratorPairIterator,
+             Comparator, _DifferenceTp>,
+          static_cast<difference_type>(length));
     else
-      return multiway_merge(seqs_begin, seqs_end, target, comp, length, stable,
-                           true, sequential_tag());
-  }
+      target_end = sequential_multiway_merge</* stable = */ true,
+        /* sentinels = */ false>(
+          seqs_begin, seqs_end,
+          target, comp, length);
+
+    return target_end;
 }
 
+/**
+ * @brief Multiway Merge Frontend.
+ *
+ * Merge the sequences specified by seqs_begin and seqs_end into
+ * target.  seqs_begin and seqs_end must point to a sequence of
+ * pairs.  These pairs must contain an iterator to the beginning
+ * of a sequence in their first entry and an iterator to the end of
+ * the same sequence in their second entry.
+ *
+ * Ties are broken arbitrarily.  See stable_multiway_merge for a variant
+ * that breaks ties by sequence number but is slower.
+ *
+ * The first entries of the pairs (i.e. the begin iterators) will be moved
+ * forward.
+ *
+ * The output sequence has to provide enough space for all elements
+ * that are written to it.
+ *
+ * This function will merge the input sequences:
+ *
+ * - not stable
+ * - parallel, depending on the input size and Settings
+ * - using sampling for splitting
+ * - using sentinels
+ *
+ * You have to take care that the element the end iterator points to is
+ * readable and contains a value that is greater than any other non-sentinel
+ * value in all sequences.
+ *
+ * Example:
+ *
+ * <pre>
+ *   int sequences[10][11];
+ *   for (int i = 0; i < 10; ++i)
+ *     for (int j = 0; j < 11; ++j)
+ *       sequences[i][j] = j; // last one is sentinel!
+ *
+ *   int out[33];
+ *   std::vector<std::pair<int*, int*> > seqs;
+ *   for (int i = 0; i < 10; ++i)
+ *     { seqs.push_back(std::make_pair(sequences[i], sequences[i] + 10)); }
+ *   multiway_merge_sentinels(seqs.begin(), seqs.end(), out,
+ *                            std::less<int>(), 33);
+ * </pre>
+ *
+ * @pre All input sequences must be sorted.
+ * @pre Target must provide enough space for @c length elements or
+ *    the number of elements in all sequences, whichever is smaller.
+ * @pre For each @c i, @c seqs_begin[i].second must be the end
+ *    marker of the sequence, and must also point to a readable
+ *    sentinel element.
+ *
+ * @post [target, return value) contains merged elements from the
+ *    input sequences.
+ * @post return value - target = min(length, number of elements in all
+ *    sequences).
+ *
+ * @see stable_multiway_merge_sentinels
+ *
+ * @param RandomAccessIteratorPairIterator iterator over sequence
+ *    of pairs of iterators
+ * @param RandomAccessIteratorOut iterator over target sequence
+ * @param _DifferenceTp difference type for the sequence
+ * @param Comparator strict weak ordering type to compare elements
+ *    in sequences
+ *
+ * @param seqs_begin  begin of the sequence of iterator pairs
+ * @param seqs_end    end of the sequence of iterator pairs
+ * @param target      target sequence to merge to.
+ * @param comp        strict weak ordering to use for element comparison.
+ * @param length      the number of elements to merge into target.
+ *
+ * @return end iterator of output sequence
+ */
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+multiway_merge_sentinels(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length)
+{
+    typedef _DifferenceTp difference_type;
+    _GLIBCXX_CALL(seqs_end - seqs_begin)
+
+    // catch special case: no sequences
+    if (seqs_begin == seqs_end)
+      { return target; }
+
+    // Execute merge; maybe parallel, depending on the number of merged
+    // elements and the number of sequences and global thresholds in
+    // Settings.
+    RandomAccessIteratorOut target_end;
+    if ((seqs_end - seqs_begin > 1) &&
+          _GLIBCXX_PARALLEL_CONDITION(
+          ((seqs_end - seqs_begin) >=
+            __gnu_parallel::_Settings::get().multiway_merge_minimal_k)
+          && ((sequence_index_t)length >=
+            __gnu_parallel::_Settings::get().multiway_merge_minimal_n)))
+      target_end = parallel_multiway_merge
+        </* stable = */ false, /* sentinels = */ true>
+          (seqs_begin, seqs_end, target, comp,
+          multiway_merge_sampling_splitting
+            </* stable = */ false, RandomAccessIteratorPairIterator,
+             Comparator, _DifferenceTp>,
+          static_cast<difference_type>(length));
+    else
+      target_end = sequential_multiway_merge
+        </* stable = */false, /* sentinels = */ true>(
+          seqs_begin, seqs_end,
+          target, comp, length);
+
+    return target_end;
+}
+
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+multiway_merge_sentinels(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length
+    , __gnu_parallel::sequential_tag)
+{
+    typedef _DifferenceTp difference_type;
+    _GLIBCXX_CALL(seqs_end - seqs_begin)
+
+    // catch special case: no sequences
+    if (seqs_begin == seqs_end)
+      { return target; }
+
+    // Execute multiway merge *sequentially*.
+    return sequential_multiway_merge
+      </* stable = */ false, /* sentinels = */ true>
+        (seqs_begin, seqs_end, target, comp, length);
+}
+
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+multiway_merge_sentinels(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length
+    , __gnu_parallel::exact_tag)
+{
+    typedef _DifferenceTp difference_type;
+    _GLIBCXX_CALL(seqs_end - seqs_begin)
+
+    // catch special case: no sequences
+    if (seqs_begin == seqs_end)
+      { return target; }
+
+    // Execute merge; maybe parallel, depending on the number of merged
+    // elements and the number of sequences and global thresholds in
+    // Settings.
+    RandomAccessIteratorOut target_end;
+    if ((seqs_end - seqs_begin > 1) &&
+          _GLIBCXX_PARALLEL_CONDITION(
+          ((seqs_end - seqs_begin) >=
+            __gnu_parallel::_Settings::get().multiway_merge_minimal_k)
+          && ((sequence_index_t)length >=
+            __gnu_parallel::_Settings::get().multiway_merge_minimal_n)))
+      target_end = parallel_multiway_merge
+        </* stable = */ false, /* sentinels = */ true>(
+          seqs_begin, seqs_end,
+          target, comp,
+          multiway_merge_exact_splitting
+            </* stable = */ false, RandomAccessIteratorPairIterator,
+              Comparator, _DifferenceTp>,
+          static_cast<difference_type>(length));
+    else
+      target_end = sequential_multiway_merge
+        </* stable = */ false, /* sentinels = */ true>(
+          seqs_begin, seqs_end,
+          target, comp, length);
+
+    return target_end;
+}
+
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+stable_multiway_merge_sentinels(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length)
+{
+    typedef _DifferenceTp difference_type;
+    _GLIBCXX_CALL(seqs_end - seqs_begin)
+
+    // catch special case: no sequences
+    if (seqs_begin == seqs_end)
+      { return target; }
+
+    // Execute merge; maybe parallel, depending on the number of merged
+    // elements and the number of sequences and global thresholds in
+    // Settings.
+    RandomAccessIteratorOut target_end;
+    if ((seqs_end - seqs_begin > 1) &&
+          _GLIBCXX_PARALLEL_CONDITION(
+          ((seqs_end - seqs_begin) >=
+            __gnu_parallel::_Settings::get().multiway_merge_minimal_k)
+          && ((sequence_index_t)length >=
+            __gnu_parallel::_Settings::get().multiway_merge_minimal_n)))
+      target_end = parallel_multiway_merge
+        </* stable = */ true, /* sentinels = */ true>(
+          seqs_begin, seqs_end,
+          target, comp,
+          multiway_merge_sampling_splitting
+            </* stable = */ true, RandomAccessIteratorPairIterator,
+            Comparator, _DifferenceTp>,
+          static_cast<difference_type>(length));
+    else
+      target_end = sequential_multiway_merge
+        </* stable = */ true, /* sentinels = */ true>(
+          seqs_begin, seqs_end,
+          target, comp, length);
+
+    return target_end;
+}
+
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+stable_multiway_merge_sentinels(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length
+    , __gnu_parallel::sequential_tag)
+{
+    typedef _DifferenceTp difference_type;
+    _GLIBCXX_CALL(seqs_end - seqs_begin)
+
+    // catch special case: no sequences
+    if (seqs_begin == seqs_end)
+      { return target; }
+
+    // Execute multiway merge *sequentially*.
+    return sequential_multiway_merge
+      </* stable = */ true, /* sentinels = */ true>
+        (seqs_begin, seqs_end, target, comp, length);
+}
+
+template<
+    typename RandomAccessIteratorPairIterator
+  , typename RandomAccessIteratorOut
+  , typename _DifferenceTp
+  , typename Comparator>
+RandomAccessIteratorOut
+stable_multiway_merge_sentinels(RandomAccessIteratorPairIterator seqs_begin
+    , RandomAccessIteratorPairIterator seqs_end
+    , RandomAccessIteratorOut target
+    , Comparator comp, _DifferenceTp length
+    , __gnu_parallel::exact_tag)
+{
+    typedef _DifferenceTp difference_type;
+    _GLIBCXX_CALL(seqs_end - seqs_begin)
+
+    // catch special case: no sequences
+    if (seqs_begin == seqs_end)
+      { return target; }
+
+    // Execute merge; maybe parallel, depending on the number of merged
+    // elements and the number of sequences and global thresholds in
+    // Settings.
+    RandomAccessIteratorOut target_end;
+    if ((seqs_end - seqs_begin > 1) &&
+          _GLIBCXX_PARALLEL_CONDITION(
+          ((seqs_end - seqs_begin) >=
+          __gnu_parallel::_Settings::get().multiway_merge_minimal_k)
+          && ((sequence_index_t)length >=
+          __gnu_parallel::_Settings::get().multiway_merge_minimal_n)))
+      target_end = parallel_multiway_merge
+        </* stable = */ true, /* sentinels = */ true>(
+          seqs_begin, seqs_end,
+          target, comp, 
+          multiway_merge_exact_splitting
+            </* stable = */ true, RandomAccessIteratorPairIterator,
+            Comparator, _DifferenceTp>,
+          static_cast<difference_type>(length));
+    else
+      target_end = sequential_multiway_merge
+        </* stable = */ true, /* sentinels = */ true>(
+          seqs_begin, seqs_end,
+          target, comp, length);
+
+    return target_end;
+}
+
+}; // namespace __gnu_parallel
+
 #endif
index c8ceb2f40b7d42508fdfd656932d8a3b2f0e4a0d..3791a144d53a088895204dd4e20494a3a775b729 100644 (file)
@@ -80,26 +80,9 @@ template<typename RandomAccessIterator>
     /** @brief Start indices, per thread. */
     difference_type* starts;
 
-    /** @brief Temporary arrays for each thread.
-     *
-     *  Indirection Allows using the temporary storage in different
-     *  ways, without code duplication.
-     *  @see _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST */
-    value_type** temporaries;
-
-#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
     /** @brief Storage in which to sort. */
-    RandomAccessIterator* sorting_places;
+    value_type** temporary;
 
-    /** @brief Storage into which to merge. */
-    value_type** merging_places;
-#else
-    /** @brief Storage in which to sort. */
-    value_type** sorting_places;
-
-    /** @brief Storage into which to merge. */
-    RandomAccessIterator* merging_places;
-#endif
     /** @brief Samples. */
     value_type* samples;
 
@@ -108,9 +91,6 @@ template<typename RandomAccessIterator>
 
     /** @brief Pieces of data to merge @c [thread][sequence] */
     std::vector<Piece<difference_type> >* pieces;
-
-    /** @brief Stable sorting desired. */
-    bool stable;
 };
 
 /**
@@ -122,7 +102,7 @@ template<typename RandomAccessIterator>
 template<typename RandomAccessIterator, typename _DifferenceTp>
   void 
   determine_samples(PMWMSSortingData<RandomAccessIterator>* sd,
-                    _DifferenceTp& num_samples)
+                    _DifferenceTp num_samples)
   {
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
@@ -130,8 +110,6 @@ template<typename RandomAccessIterator, typename _DifferenceTp>
 
     thread_index_t iam = omp_get_thread_num();
 
-    num_samples = _Settings::get().sort_mwms_oversampling * sd->num_threads - 1;
-
     difference_type* es = new difference_type[num_samples + 2];
 
     equally_split(sd->starts[iam + 1] - sd->starts[iam], 
@@ -144,11 +122,201 @@ template<typename RandomAccessIterator, typename _DifferenceTp>
     delete[] es;
   }
 
+/** @brief Split consistently; empty primary, specialized on @c exact. */
+template<bool exact, typename RandomAccessIterator,
+          typename Comparator, typename SortingPlacesIterator>
+  struct split_consistently
+  {
+  };
+
+/** @brief Split by exact splitting. */
+template<typename RandomAccessIterator, typename Comparator,
+          typename SortingPlacesIterator>
+  struct split_consistently
+    <true, RandomAccessIterator, Comparator, SortingPlacesIterator>
+  {
+    void operator()(
+      const thread_index_t iam,
+      PMWMSSortingData<RandomAccessIterator>* sd,
+      Comparator& comp,
+      const typename
+        std::iterator_traits<RandomAccessIterator>::difference_type
+          num_samples)
+      const
+  {
+#   pragma omp barrier
+
+    std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
+        seqs(sd->num_threads);
+    for (thread_index_t s = 0; s < sd->num_threads; s++)
+      seqs[s] = std::make_pair(sd->temporary[s],
+                                sd->temporary[s]
+                                    + (sd->starts[s + 1] - sd->starts[s]));
+
+    std::vector<SortingPlacesIterator> offsets(sd->num_threads);
+
+    // if not last thread
+    if (iam < sd->num_threads - 1)
+      multiseq_partition(seqs.begin(), seqs.end(),
+                          sd->starts[iam + 1], offsets.begin(), comp);
+
+    for (int seq = 0; seq < sd->num_threads; seq++)
+      {
+        // for each sequence
+        if (iam < (sd->num_threads - 1))
+          sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
+        else
+          // very end of this sequence
+          sd->pieces[iam][seq].end =
+              sd->starts[seq + 1] - sd->starts[seq];
+      }
+
+#   pragma omp barrier
+
+    for (thread_index_t seq = 0; seq < sd->num_threads; seq++)
+      {
+        // For each sequence.
+        if (iam > 0)
+          sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
+        else
+          // Absolute beginning.
+          sd->pieces[iam][seq].begin = 0;
+      }
+  }   
+  };
+
+/** @brief Split by sampling: splitters are taken from sorted samples. */
+template<typename RandomAccessIterator, typename Comparator,
+          typename SortingPlacesIterator>
+  struct split_consistently<false, RandomAccessIterator, Comparator,
+                             SortingPlacesIterator>
+  {
+    void operator()(
+        const thread_index_t iam,
+        PMWMSSortingData<RandomAccessIterator>* sd,
+        Comparator& comp,
+        const typename
+          std::iterator_traits<RandomAccessIterator>::difference_type
+            num_samples)
+        const
+    {
+      typedef std::iterator_traits<RandomAccessIterator> traits_type;
+      typedef typename traits_type::value_type value_type;
+      typedef typename traits_type::difference_type difference_type;
+
+      determine_samples(sd, num_samples);
+
+#     pragma omp barrier
+      // A single thread sorts the num_samples * num_threads samples.
+#     pragma omp single
+      __gnu_sequential::sort(sd->samples,
+                             sd->samples + (num_samples * sd->num_threads),
+                             comp);
+
+#     pragma omp barrier
+
+      for (thread_index_t s = 0; s < sd->num_threads; ++s)
+        {
+          // For each sequence s: find this thread's piece via lower_bound.
+          if (num_samples * iam > 0)
+            sd->pieces[iam][s].begin =
+                std::lower_bound(sd->temporary[s],
+                    sd->temporary[s]
+                        + (sd->starts[s + 1] - sd->starts[s]),
+                    sd->samples[num_samples * iam],
+                    comp)
+                - sd->temporary[s];
+          else
+            // Absolute beginning.
+            sd->pieces[iam][s].begin = 0;
+
+          if ((num_samples * (iam + 1)) < (num_samples * sd->num_threads))
+            sd->pieces[iam][s].end =
+                std::lower_bound(sd->temporary[s],
+                        sd->temporary[s]
+                            + (sd->starts[s + 1] - sd->starts[s]),
+                        sd->samples[num_samples * (iam + 1)],
+                        comp)
+                - sd->temporary[s];
+          else
+            // Absolute end.
+            sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
+        }
+    }
+  };
+/** @brief Sort functor; empty primary, specialized on @c stable. */
+template<bool stable, typename RandomAccessIterator, typename Comparator>
+  struct possibly_stable_sort
+  {
+  };
+/** @brief @c stable == true: sorts with __gnu_sequential::stable_sort. */
+template<typename RandomAccessIterator, typename Comparator>
+  struct possibly_stable_sort<true, RandomAccessIterator, Comparator>
+  {
+    void operator()(const RandomAccessIterator& begin,
+                     const RandomAccessIterator& end, Comparator& comp) const
+    {
+      __gnu_sequential::stable_sort(begin, end, comp); 
+    }
+  };
+/** @brief @c stable == false: sorts with __gnu_sequential::sort. */
+template<typename RandomAccessIterator, typename Comparator>
+  struct possibly_stable_sort<false, RandomAccessIterator, Comparator>
+  {
+    void operator()(const RandomAccessIterator begin,
+                     const RandomAccessIterator end, Comparator& comp) const
+    {
+      __gnu_sequential::sort(begin, end, comp); 
+    }
+  };
+/** @brief Merge functor; empty primary, specialized on @c stable. */
+template<bool stable, typename SeqRandomAccessIterator,
+          typename RandomAccessIterator, typename Comparator,
+          typename DiffType>
+  struct possibly_stable_multiway_merge
+  {
+  };
+/** @brief @c stable == true: forwards to stable_multiway_merge. */
+template<typename SeqRandomAccessIterator, typename RandomAccessIterator,
+          typename Comparator, typename DiffType>
+  struct possibly_stable_multiway_merge
+    <true, SeqRandomAccessIterator, RandomAccessIterator, Comparator,
+    DiffType>
+  {
+    void operator()(const SeqRandomAccessIterator& seqs_begin,
+                      const SeqRandomAccessIterator& seqs_end,
+                      const RandomAccessIterator& target,
+                      Comparator& comp,
+                      DiffType length_am) const
+    {
+      stable_multiway_merge(seqs_begin, seqs_end, target, comp,
+                       length_am, sequential_tag()); // result is discarded
+    }
+  };
+/** @brief @c stable == false: forwards to multiway_merge. */
+template<typename SeqRandomAccessIterator, typename RandomAccessIterator,
+          typename Comparator, typename DiffType>
+  struct possibly_stable_multiway_merge
+    <false, SeqRandomAccessIterator, RandomAccessIterator, Comparator,
+    DiffType>
+  {
+    void operator()(const SeqRandomAccessIterator& seqs_begin,
+                      const SeqRandomAccessIterator& seqs_end,
+                      const RandomAccessIterator& target,
+                      Comparator& comp,
+                      DiffType length_am) const
+    {
+      multiway_merge(seqs_begin, seqs_end, target, comp,
+                       length_am, sequential_tag()); // result is discarded
+    }
+  };
+
 /** @brief PMWMS code executed by each thread.
   *  @param sd Pointer to algorithm data.
   *  @param comp Comparator.
   */
-template<typename RandomAccessIterator, typename Comparator>
+template<bool stable, bool exact, typename RandomAccessIterator,
+          typename Comparator>
   void 
   parallel_sort_mwms_pu(PMWMSSortingData<RandomAccessIterator>* sd,
                         Comparator& comp)
@@ -162,165 +330,65 @@ template<typename RandomAccessIterator, typename Comparator>
     // Length of this thread's chunk, before merging.
     difference_type length_local = sd->starts[iam + 1] - sd->starts[iam];
 
-#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
-    typedef RandomAccessIterator SortingPlacesIterator;
+    // Sort in temporary storage, leave space for sentinel.
 
-    // Sort in input storage.
-    sd->sorting_places[iam] = sd->source + sd->starts[iam];
-#else
     typedef value_type* SortingPlacesIterator;
 
-    // Sort in temporary storage, leave space for sentinel.
-    sd->sorting_places[iam] = sd->temporaries[iam] = 
+    sd->temporary[iam] =
         static_cast<value_type*>(
         ::operator new(sizeof(value_type) * (length_local + 1)));
 
     // Copy there.
     std::uninitialized_copy(sd->source + sd->starts[iam],
                             sd->source + sd->starts[iam] + length_local,
-                            sd->sorting_places[iam]);
-#endif
-
-    // Sort locally.
-    if (sd->stable)
-      __gnu_sequential::stable_sort(sd->sorting_places[iam],
-                                    sd->sorting_places[iam] + length_local,
-                                    comp);
-    else
-      __gnu_sequential::sort(sd->sorting_places[iam],
-                             sd->sorting_places[iam] + length_local,
-                             comp);
-
-    // Invariant: locally sorted subsequence in sd->sorting_places[iam],
-    // sd->sorting_places[iam] + length_local.
-    const _Settings& __s = _Settings::get();
-    if (__s.sort_splitting == SAMPLING)
-      {
-        difference_type num_samples;
-        determine_samples(sd, num_samples);
-
-#       pragma omp barrier
-
-#       pragma omp single
-        __gnu_sequential::sort(sd->samples,
-                               sd->samples + (num_samples * sd->num_threads),
-                               comp);
-
-#       pragma omp barrier
-
-        for (int s = 0; s < sd->num_threads; ++s)
-          {
-            // For each sequence.
-              if (num_samples * iam > 0)
-                sd->pieces[iam][s].begin = 
-                    std::lower_bound(sd->sorting_places[s],
-                        sd->sorting_places[s]
-                            + (sd->starts[s + 1] - sd->starts[s]),
-                        sd->samples[num_samples * iam],
-                        comp)
-                    - sd->sorting_places[s];
-            else
-              // Absolute beginning.
-              sd->pieces[iam][s].begin = 0;
-
-            if ((num_samples * (iam + 1)) < (num_samples * sd->num_threads))
-              sd->pieces[iam][s].end =
-                  std::lower_bound(sd->sorting_places[s],
-                          sd->sorting_places[s]
-                              + (sd->starts[s + 1] - sd->starts[s]),
-                          sd->samples[num_samples * (iam + 1)],
-                          comp)
-                  - sd->sorting_places[s];
-            else
-              // Absolute end.
-              sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
-            }
-      }
-    else if (__s.sort_splitting == EXACT)
-      {
-#       pragma omp barrier
+                            sd->temporary[iam]);
 
-        std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
-            seqs(sd->num_threads);
-        for (int s = 0; s < sd->num_threads; ++s)
-          seqs[s] = std::make_pair(sd->sorting_places[s],
-                                   sd->sorting_places[s]
-                                       + (sd->starts[s + 1] - sd->starts[s]));
+    possibly_stable_sort<stable, SortingPlacesIterator, Comparator>()
+        (sd->temporary[iam], sd->temporary[iam] + length_local, comp);
 
-        std::vector<SortingPlacesIterator> offsets(sd->num_threads);
+    // Invariant: locally sorted subsequence in sd->temporary[iam],
+    // sd->temporary[iam] + length_local.
 
-        // if not last thread
-        if (iam < sd->num_threads - 1)
-          multiseq_partition(seqs.begin(), seqs.end(),
-                             sd->starts[iam + 1], offsets.begin(), comp);
+    // No barrier here: Synchronization is done by the splitting routine.
 
-        for (int seq = 0; seq < sd->num_threads; ++seq)
-          {
-            // for each sequence
-            if (iam < (sd->num_threads - 1))
-              sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
-            else
-              // very end of this sequence
-              sd->pieces[iam][seq].end = (sd->starts[seq + 1]
-                                         - sd->starts[seq]);
-          }
-
-#       pragma omp barrier
-
-        for (int seq = 0; seq < sd->num_threads; ++seq)
-          {
-            // For each sequence.
-            if (iam > 0)
-              sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
-            else
-              // Absolute beginning.
-              sd->pieces[iam][seq].begin = 0;
-          }
-      }
+    difference_type num_samples =
+        _Settings::get().sort_mwms_oversampling * sd->num_threads - 1;
+    split_consistently
+      <exact, RandomAccessIterator, Comparator, SortingPlacesIterator>()
+        (iam, sd, comp, num_samples);
 
     // Offset from target begin, length after merging.
     difference_type offset = 0, length_am = 0;
-    for (int s = 0; s < sd->num_threads; ++s)
+    for (thread_index_t s = 0; s < sd->num_threads; s++)
       {
         length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
         offset += sd->pieces[iam][s].begin;
       }
 
-#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
-    // Merge to temporary storage, uninitialized creation not possible
-    // since there is no multiway_merge calling the placement new
-    // instead of the assignment operator.
-    // XXX incorrect (de)construction
-    sd->merging_places[iam] = sd->temporaries[iam] =
-        static_cast<value_type*>(::operator new(sizeof(value_type)
-                                               * length_am));
-#else
-    // Merge directly to target.
-    sd->merging_places[iam] = sd->source + offset;
-#endif
-    std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
-        seqs(sd->num_threads);
+    typedef std::vector<
+      std::pair<SortingPlacesIterator, SortingPlacesIterator> >
+        seq_vector_type;
+    seq_vector_type seqs(sd->num_threads);
 
     for (int s = 0; s < sd->num_threads; ++s)
       {
         seqs[s] =
-         std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin,
-                        sd->sorting_places[s] + sd->pieces[iam][s].end);
+          std::make_pair(sd->temporary[s] + sd->pieces[iam][s].begin,
+        sd->temporary[s] + sd->pieces[iam][s].end);
       }
 
-    multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam], comp,
-                   length_am, sd->stable, false, sequential_tag());
+    possibly_stable_multiway_merge<
+        stable,
+        typename seq_vector_type::iterator,
+        RandomAccessIterator,
+        Comparator, difference_type>()
+          (seqs.begin(), seqs.end(),
+           sd->source + offset, comp,
+           length_am);
 
 #   pragma omp barrier
 
-#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
-    // Write back.
-    std::copy(sd->merging_places[iam],
-              sd->merging_places[iam] + length_am,
-              sd->source + offset);
-#endif
-
-    ::operator delete(sd->temporaries[iam]);
+    ::operator delete(sd->temporary[iam]);
   }
 
 /** @brief PMWMS main call.
@@ -329,21 +397,22 @@ template<typename RandomAccessIterator, typename Comparator>
   *  @param comp Comparator.
   *  @param n Length of sequence.
   *  @param num_threads Number of threads to use.
-  *  @param stable Stable sorting.
   */
-template<typename RandomAccessIterator, typename Comparator>
+template<bool stable, bool exact, typename RandomAccessIterator,
+           typename Comparator>
   void
   parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end,
-                     Comparator comp, typename
-                    std::iterator_traits<RandomAccessIterator>::
-                    difference_type n, int num_threads, bool stable)
+                     Comparator comp,
+                     thread_index_t num_threads)
   {
-    _GLIBCXX_CALL(n)
+    _GLIBCXX_CALL(end - begin)
 
     typedef std::iterator_traits<RandomAccessIterator> traits_type;
     typedef typename traits_type::value_type value_type;
     typedef typename traits_type::difference_type difference_type;
 
+    difference_type n = end - begin;
+
     if (n <= 1)
       return;
 
@@ -354,7 +423,6 @@ template<typename RandomAccessIterator, typename Comparator>
     // shared variables
     PMWMSSortingData<RandomAccessIterator> sd;
     difference_type* starts;
-    const _Settings& __s = _Settings::get();
 
 #   pragma omp parallel num_threads(num_threads)
       {
@@ -364,23 +432,16 @@ template<typename RandomAccessIterator, typename Comparator>
           {
             sd.num_threads = num_threads;
             sd.source = begin;
-            sd.temporaries = new value_type*[num_threads];
-
-#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
-            sd.sorting_places = new RandomAccessIterator[num_threads];
-            sd.merging_places = new value_type*[num_threads];
-#else
-            sd.sorting_places = new value_type*[num_threads];
-            sd.merging_places = new RandomAccessIterator[num_threads];
-#endif
 
-            if (__s.sort_splitting == SAMPLING)
+            sd.temporary = new value_type*[num_threads];
+
+            if (!exact)
               {
-                unsigned int size = 
-                    (__s.sort_mwms_oversampling * num_threads - 1)
+                difference_type size =
+                    (_Settings::get().sort_mwms_oversampling * num_threads - 1)
                         * num_threads;
                 sd.samples = static_cast<value_type*>(
-                 ::operator new(size * sizeof(value_type)));
+                              ::operator new(size * sizeof(value_type)));
               }
             else
               sd.samples = NULL;
@@ -390,7 +451,6 @@ template<typename RandomAccessIterator, typename Comparator>
             for (int s = 0; s < num_threads; ++s)
               sd.pieces[s].resize(num_threads);
             starts = sd.starts = new difference_type[num_threads + 1];
-            sd.stable = stable;
 
             difference_type chunk_length = n / num_threads;
             difference_type split = n % num_threads;
@@ -401,18 +461,16 @@ template<typename RandomAccessIterator, typename Comparator>
                 pos += (i < split) ? (chunk_length + 1) : chunk_length;
               }
             starts[num_threads] = pos;
-          }
+          } //single
 
         // Now sort in parallel.
-        parallel_sort_mwms_pu(&sd, comp);
+        parallel_sort_mwms_pu<stable, exact>(&sd, comp);
       } //parallel
 
     delete[] starts;
-    delete[] sd.temporaries;
-    delete[] sd.sorting_places;
-    delete[] sd.merging_places;
+    delete[] sd.temporary;
 
-    if (__s.sort_splitting == SAMPLING)
+    if (!exact)
       ::operator delete(sd.samples);
 
     delete[] sd.offsets;
index edf4eea02d8a08a1e0e1e0572d00dedaf243e640..83aa2df1b1108aad1ae226c1affa07f3b4374e10 100644 (file)
@@ -71,7 +71,7 @@ namespace __gnu_parallel
   template<typename RandomAccessIterator, typename Comparator>
     inline void
     parallel_sort(RandomAccessIterator begin, RandomAccessIterator end,
-                 Comparator comp, bool stable)
+                  Comparator comp, bool stable)
     {
       _GLIBCXX_CALL(end - begin)
       typedef std::iterator_traits<RandomAccessIterator> traits_type;
@@ -79,25 +79,43 @@ namespace __gnu_parallel
       typedef typename traits_type::difference_type difference_type;
 
       if (begin != end)
-       {
-         difference_type n = end - begin;
+      {
+        difference_type n = end - begin;
 
-         if (false) ;
+        if (false) ;
 #if _GLIBCXX_MERGESORT
-         else if (stable || _Settings::get().sort_algorithm == MWMS)
-           parallel_sort_mwms(begin, end, comp, n, get_max_threads(), stable);
+        else if (stable)
+          {
+            if(_Settings::get().sort_splitting == EXACT)
+              parallel_sort_mwms<true, true>
+                (begin, end, comp, get_max_threads());
+            else
+              parallel_sort_mwms<true, false>
+                (begin, end, comp, get_max_threads());
+          }
+        else if (_Settings::get().sort_algorithm == MWMS)
+          {
+            if(_Settings::get().sort_splitting == EXACT)
+              parallel_sort_mwms<false, true>
+                (begin, end, comp, get_max_threads());
+            else
+              parallel_sort_mwms<false, false>
+                (begin, end, comp, get_max_threads());
+          }
 #endif
 #if _GLIBCXX_QUICKSORT
-         else if (!stable && _Settings::get().sort_algorithm == QS)
-           parallel_sort_qs(begin, end, comp, n, get_max_threads());
+        else if (!stable && _Settings::get().sort_algorithm == QS)
+          parallel_sort_qs(begin, end, comp, n, get_max_threads());
 #endif
 #if _GLIBCXX_BAL_QUICKSORT
-         else if (!stable && _Settings::get().sort_algorithm == QS_BALANCED)
-           parallel_sort_qsb(begin, end, comp, n, get_max_threads());
+        else if (!stable && _Settings::get().sort_algorithm == QS_BALANCED)
+          parallel_sort_qsb(begin, end, comp, n, get_max_threads());
 #endif
-         else
-           __gnu_sequential::sort(begin, end, comp);
-       }
+        else if(stable)
+          __gnu_sequential::stable_sort(begin, end, comp);
+        else
+          __gnu_sequential::sort(begin, end, comp);
+      }
     }
 } // end namespace __gnu_parallel
 
index b3f2ec86912abaad9deeac2d3674546f18da94fd..f57add97c7b0e18324525b0a26eec2d1f1edc7d3 100644 (file)
@@ -44,6 +44,9 @@ namespace __gnu_parallel
   /** @brief Forces sequential execution at compile time. */
   struct sequential_tag { };
 
+  /** @brief Forces exact splitting in multiway merge at compile time. */
+  struct exact_tag { }; // empty type, used only to select an overload
+
   /** @brief Recommends parallel execution at compile time. */
   struct parallel_tag { };
 
index ded617edb6dc99a98e29d0ea5d8d5b3e71d0a209..1b646b02084e82d7bdcd1b769b74cb44bbb1856a 100644 (file)
@@ -87,15 +87,10 @@ namespace __gnu_parallel
   /// Merging algorithms: 
   // bubblesort-alike, loser-tree variants, enum sentinel.
   enum _MultiwayMergeAlgorithm
-    { 
-      BUBBLE, 
-      LOSER_TREE_EXPLICIT, 
-      LOSER_TREE, 
-      LOSER_TREE_COMBINED, 
-      LOSER_TREE_SENTINEL, 
-      ENUM_SENTINEL 
+    {
+      LOSER_TREE
     };
-  
+
   /// Partial sum algorithms: recursive, linear.
   enum _PartialSumAlgorithm 
     { 
diff --git a/libstdc++-v3/testsuite/25_algorithms/sort/35588.cc b/libstdc++-v3/testsuite/25_algorithms/sort/35588.cc
new file mode 100644 (file)
index 0000000..715fa3b
--- /dev/null
@@ -0,0 +1,32 @@
+// Copyright (C) 2008 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 2, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING.  If not, write to the Free
+// Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+// USA.
+
+#include <algorithm>
+#include <functional>
+#include <tr1/functional>
+
+// libstdc++/35588: std::sort must accept a tr1::bind() comparator.
+int main()
+{
+  using namespace std;
+  using namespace tr1;
+  using namespace placeholders;
+
+  int t[10] = { };  // value-initialize: sort must not read indeterminate ints
+  sort(t, t+10, bind(less<int>(), _1, _2));
+}