// -*- C++ -*-

// Copyright (C) 2007 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the terms
// of the GNU General Public License as published by the Free Software
// Foundation; either version 2, or (at your option) any later
// version.

// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with this library; see the file COPYING. If not, write to
// the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
// MA 02111-1307, USA.

// As a special exception, you may use this file as part of a free
// software library without restriction. Specifically, if other files
// instantiate templates or use macros or inline functions from this
// file, or you compile this file and link it with other files to
// produce an executable, this file does not by itself cause the
// resulting executable to be covered by the GNU General Public
// License. This exception does not however invalidate any other
// reasons why the executable file might be covered by the GNU General
// Public License.

/** @file parallel/multiway_mergesort.h
 *  @brief Parallel multiway merge sort.
 *  This file is a GNU parallel extension to the Standard C++ Library.
 */

// Written by Johannes Singler.

#ifndef _GLIBCXX_PARALLEL_MERGESORT_H
#define _GLIBCXX_PARALLEL_MERGESORT_H 1

#include <vector>

#include <parallel/basic_iterator.h>
#include <bits/stl_algo.h>
#include <parallel/parallel.h>
#include <parallel/multiway_merge.h>
namespace __gnu_parallel
{

  /** @brief Subsequence description. */
  template<typename _DifferenceTp>
    struct Piece
    {
      typedef _DifferenceTp difference_type;

      /** @brief Begin of subsequence. */
      difference_type begin;

      /** @brief End of subsequence. */
      difference_type end;
    };

  /** @brief Data accessed by all threads.
   *
   *  PMWMS = parallel multiway mergesort */
  template<typename RandomAccessIterator>
    struct PMWMSSortingData
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef typename traits_type::difference_type difference_type;

      /** @brief Number of threads involved. */
      thread_index_t num_threads;

      /** @brief Input begin. */
      RandomAccessIterator source;

      /** @brief Start indices, per thread. */
      difference_type* starts;

      /** @brief Temporary arrays for each thread.
       *
       *  Indirection allows the temporary storage to be used in
       *  different ways, without code duplication.
       *  @see _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST */
      value_type** temporaries;

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
      /** @brief Storage in which to sort. */
      RandomAccessIterator* sorting_places;

      /** @brief Storage into which to merge. */
      value_type** merging_places;
#else
      /** @brief Storage in which to sort. */
      value_type** sorting_places;

      /** @brief Storage into which to merge. */
      RandomAccessIterator* merging_places;
#endif
      /** @brief Samples. */
      value_type* samples;

      /** @brief Offsets to add to the found positions. */
      difference_type* offsets;

      /** @brief Pieces of data to merge @c [thread][sequence]. */
      std::vector<Piece<difference_type> >* pieces;

      /** @brief Stable sorting desired. */
      bool stable;
    };
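
  // Overview of the algorithm below: the input is split into
  // num_threads consecutive chunks (recorded in starts); each thread
  // sorts its chunk locally; the sorted chunks are then partitioned
  // into pieces, by sampling or by exact splitting (see
  // Settings::sort_splitting), such that thread i can multiway-merge
  // the i-th piece of every chunk into its slice of the final result.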

  /**
   *  @brief Select samples from a sequence.
   *  @param sd Pointer to algorithm data. Result will be placed in
   *  @c sd->samples.
   *  @param num_samples Number of samples to select.
   */
  template<typename RandomAccessIterator, typename _DifferenceTp>
    inline void
    determine_samples(PMWMSSortingData<RandomAccessIterator>* sd,
                      _DifferenceTp& num_samples)
    {
      typedef _DifferenceTp difference_type;

      thread_index_t iam = omp_get_thread_num();

      num_samples =
        Settings::sort_mwms_oversampling * sd->num_threads - 1;

      difference_type* es = new difference_type[num_samples + 2];

      equally_split(sd->starts[iam + 1] - sd->starts[iam],
                    num_samples + 1, es);

      for (difference_type i = 0; i < num_samples; i++)
        sd->samples[iam * num_samples + i] =
          sd->source[sd->starts[iam] + es[i + 1]];

      delete[] es;
    }
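
  // For example (hypothetical values), with
  // Settings::sort_mwms_oversampling == 10 and 4 threads, each thread
  // contributes num_samples == 10 * 4 - 1 == 39 equally spaced samples
  // from its own chunk, 156 samples in total; after the samples are
  // sorted, every 39th one serves as a splitter below.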

  /** @brief PMWMS code executed by each thread.
   *  @param sd Pointer to algorithm data.
   *  @param comp Comparator.
   */
  template<typename RandomAccessIterator, typename Comparator>
    inline void
    parallel_sort_mwms_pu(PMWMSSortingData<RandomAccessIterator>* sd,
                          Comparator& comp)
    {
      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef typename traits_type::difference_type difference_type;

      thread_index_t iam = omp_get_thread_num();

      // Length of this thread's chunk, before merging.
      difference_type length_local = sd->starts[iam + 1] - sd->starts[iam];

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
      typedef RandomAccessIterator SortingPlacesIterator;

      // Sort in input storage.
      sd->sorting_places[iam] = sd->source + sd->starts[iam];
#else
      typedef value_type* SortingPlacesIterator;

      // Sort in temporary storage, leave space for sentinel.
      sd->sorting_places[iam] = sd->temporaries[iam] =
        static_cast<value_type*>(
          ::operator new(sizeof(value_type) * (length_local + 1)));

      // Copy there.
      std::uninitialized_copy(sd->source + sd->starts[iam],
                              sd->source + sd->starts[iam] + length_local,
                              sd->sorting_places[iam]);
#endif

      // Sort locally.
      if (sd->stable)
        __gnu_sequential::stable_sort(sd->sorting_places[iam],
                                      sd->sorting_places[iam] + length_local,
                                      comp);
      else
        __gnu_sequential::sort(sd->sorting_places[iam],
                               sd->sorting_places[iam] + length_local,
                               comp);

      // Invariant: locally sorted subsequence in
      // [sd->sorting_places[iam], sd->sorting_places[iam] + length_local).

      if (Settings::sort_splitting == Settings::SAMPLING)
        {
          difference_type num_samples;
          determine_samples(sd, num_samples);

#         pragma omp barrier

#         pragma omp single
          __gnu_sequential::sort(sd->samples,
                                 sd->samples
                                   + (num_samples * sd->num_threads),
                                 comp);

#         pragma omp barrier

          for (int s = 0; s < sd->num_threads; s++)
            {
              // For each sequence.
              if (num_samples * iam > 0)
                sd->pieces[iam][s].begin =
                  std::lower_bound(sd->sorting_places[s],
                                   sd->sorting_places[s]
                                     + sd->starts[s + 1] - sd->starts[s],
                                   sd->samples[num_samples * iam],
                                   comp)
                  - sd->sorting_places[s];
              else
                // Absolute beginning.
                sd->pieces[iam][s].begin = 0;

              if ((num_samples * (iam + 1))
                  < (num_samples * sd->num_threads))
                sd->pieces[iam][s].end =
                  std::lower_bound(sd->sorting_places[s],
                                   sd->sorting_places[s]
                                     + sd->starts[s + 1] - sd->starts[s],
                                   sd->samples[num_samples * (iam + 1)],
                                   comp)
                  - sd->sorting_places[s];
              else
                // Absolute end.
                sd->pieces[iam][s].end = sd->starts[s + 1] - sd->starts[s];
            }
        }
      else if (Settings::sort_splitting == Settings::EXACT)
        {
#         pragma omp barrier

          std::vector<std::pair<SortingPlacesIterator,
                                SortingPlacesIterator> >
            seqs(sd->num_threads);
          for (int s = 0; s < sd->num_threads; s++)
            seqs[s] = std::make_pair(sd->sorting_places[s],
                                     sd->sorting_places[s]
                                       + sd->starts[s + 1] - sd->starts[s]);

          std::vector<SortingPlacesIterator> offsets(sd->num_threads);

          // If not the last thread.
          if (iam < sd->num_threads - 1)
            multiseq_partition(seqs.begin(), seqs.end(),
                               sd->starts[iam + 1], offsets.begin(), comp);

          for (int seq = 0; seq < sd->num_threads; seq++)
            {
              // For each sequence.
              if (iam < (sd->num_threads - 1))
                sd->pieces[iam][seq].end = offsets[seq] - seqs[seq].first;
              else
                // Very end of this sequence.
                sd->pieces[iam][seq].end =
                  sd->starts[seq + 1] - sd->starts[seq];
            }

#         pragma omp barrier

          for (int seq = 0; seq < sd->num_threads; seq++)
            {
              // For each sequence.
              if (iam > 0)
                sd->pieces[iam][seq].begin = sd->pieces[iam - 1][seq].end;
              else
                // Absolute beginning.
                sd->pieces[iam][seq].begin = 0;
            }
        }
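
      // Either way, sd->pieces[iam][s] now delimits, within sorted
      // sequence s, exactly those elements whose global rank places
      // them in thread iam's part of the output.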

      // Offset from target begin (the number of elements preceding this
      // thread's part of the output), and length after merging.
      difference_type offset = 0, length_am = 0;
      for (int s = 0; s < sd->num_threads; s++)
        {
          length_am += sd->pieces[iam][s].end - sd->pieces[iam][s].begin;
          offset += sd->pieces[iam][s].begin;
        }

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
      // Merge into temporary storage. Constructing into uninitialized
      // memory is not possible here, since no multiway_merge variant
      // uses placement new instead of the assignment operator.
      sd->merging_places[iam] = sd->temporaries[iam] =
        static_cast<value_type*>(
          ::operator new(sizeof(value_type) * length_am));
#else
      // Merge directly to target.
      sd->merging_places[iam] = sd->source + offset;
#endif
      std::vector<std::pair<SortingPlacesIterator, SortingPlacesIterator> >
        seqs(sd->num_threads);

      for (int s = 0; s < sd->num_threads; s++)
        {
          seqs[s] =
            std::make_pair(sd->sorting_places[s] + sd->pieces[iam][s].begin,
                           sd->sorting_places[s] + sd->pieces[iam][s].end);
        }

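      // Each thread sequentially multiway-merges its pieces of all
      // locally sorted sequences into its own slice of the result; the
      // slices are disjoint, so no synchronization is needed until the
      // barrier below.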
      multiway_merge(seqs.begin(), seqs.end(), sd->merging_places[iam],
                     comp, length_am, sd->stable, false, sequential_tag());

#     pragma omp barrier

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
      // Write back.
      std::copy(sd->merging_places[iam],
                sd->merging_places[iam] + length_am,
                sd->source + offset);
#endif

      // Release the temporary storage: it came from ::operator new, so
      // plain delete[] would not match the allocation. (As in the
      // allocation above, element destructors are not run; this is only
      // safe for value types with trivial destructors.)
      ::operator delete(sd->temporaries[iam]);
    }

  /** @brief PMWMS main call.
   *  @param begin Begin iterator of sequence.
   *  @param end End iterator of sequence.
   *  @param comp Comparator.
   *  @param n Length of sequence.
   *  @param num_threads Number of threads to use.
   *  @param stable Stable sorting.
   */
  template<typename RandomAccessIterator, typename Comparator>
    inline void
    parallel_sort_mwms(RandomAccessIterator begin, RandomAccessIterator end,
                       Comparator comp,
                       typename std::iterator_traits<RandomAccessIterator>
                         ::difference_type n,
                       int num_threads,
                       bool stable)
    {
      _GLIBCXX_CALL(n)

      typedef std::iterator_traits<RandomAccessIterator> traits_type;
      typedef typename traits_type::value_type value_type;
      typedef typename traits_type::difference_type difference_type;

      if (n <= 1)
        return;

      // At least one element per thread.
      if (num_threads > n)
        num_threads = static_cast<thread_index_t>(n);

      // Shared variables.
      PMWMSSortingData<RandomAccessIterator> sd;
      difference_type* starts;

#     pragma omp parallel num_threads(num_threads)
      {
        // Number of threads actually obtained; the OpenMP runtime may
        // provide fewer threads than requested, but never more.
        num_threads = omp_get_num_threads();

#       pragma omp single
        {
          sd.num_threads = num_threads;
          sd.source = begin;
          sd.temporaries = new value_type*[num_threads];

#if _GLIBCXX_MULTIWAY_MERGESORT_COPY_LAST
          sd.sorting_places = new RandomAccessIterator[num_threads];
          sd.merging_places = new value_type*[num_threads];
#else
          sd.sorting_places = new value_type*[num_threads];
          sd.merging_places = new RandomAccessIterator[num_threads];
#endif

          if (Settings::sort_splitting == Settings::SAMPLING)
            {
              unsigned int size = (Settings::sort_mwms_oversampling
                                   * num_threads - 1) * num_threads;
              sd.samples = static_cast<value_type*>(
                ::operator new(size * sizeof(value_type)));
            }
          else
            sd.samples = NULL;

          sd.offsets = new difference_type[num_threads - 1];
          sd.pieces = new std::vector<Piece<difference_type> >[num_threads];
          for (int s = 0; s < num_threads; s++)
            sd.pieces[s].resize(num_threads);
          starts = sd.starts = new difference_type[num_threads + 1];
          sd.stable = stable;

          difference_type chunk_length = n / num_threads;
          difference_type split = n % num_threads;
          difference_type pos = 0;
          for (int i = 0; i < num_threads; i++)
            {
              starts[i] = pos;
              pos += (i < split) ? (chunk_length + 1) : chunk_length;
            }
          starts[num_threads] = pos;
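
          // Example (hypothetical values): n == 10 and num_threads == 3
          // give chunk_length == 3 and split == 1, so the chunk lengths
          // are 4, 3, 3 and starts == {0, 4, 7, 10}.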
        }

        // Now sort in parallel.
        parallel_sort_mwms_pu(&sd, comp);
      } // parallel

      delete[] starts;
      delete[] sd.temporaries;
      delete[] sd.sorting_places;
      delete[] sd.merging_places;

      // sd.samples came from ::operator new, so release it the same
      // way; delete[] would not match the allocation.
      if (Settings::sort_splitting == Settings::SAMPLING)
        ::operator delete(sd.samples);

      delete[] sd.offsets;
      delete[] sd.pieces;
    }
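
  // A minimal usage sketch (assuming an OpenMP-enabled build), calling
  // the entry point defined above:
  //
  //   std::vector<int> v = /* ... */;
  //   __gnu_parallel::parallel_sort_mwms(v.begin(), v.end(),
  //                                      std::less<int>(), v.size(),
  //                                      omp_get_max_threads(),
  //                                      /* stable = */ false);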
} // namespace __gnu_parallel

#endif