PR libstdc++/33489 continued.
[gcc.git] / libstdc++-v3 / include / parallel / random_shuffle.h
1 // -*- C++ -*-
2
3 // Copyright (C) 2007 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 2, or (at your option) any later
9 // version.
10
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
15
16 // You should have received a copy of the GNU General Public License
17 // along with this library; see the file COPYING. If not, write to
18 // the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
19 // MA 02111-1307, USA.
20
21 // As a special exception, you may use this file as part of a free
22 // software library without restriction. Specifically, if other files
23 // instantiate templates or use macros or inline functions from this
24 // file, or you compile this file and link it with other files to
25 // produce an executable, this file does not by itself cause the
26 // resulting executable to be covered by the GNU General Public
27 // License. This exception does not however invalidate any other
28 // reasons why the executable file might be covered by the GNU General
29 // Public License.
30
31 /** @file parallel/random_shuffle.h
32 * @brief Parallel implementation of std::random_shuffle().
33 * This file is a GNU parallel extension to the Standard C++ Library.
34 */
35
36 // Written by Johannes Singler.
37
38 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
39 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
40
41 #include <limits>
42 #include <bits/stl_numeric.h>
43 #include <parallel/parallel.h>
44 #include <parallel/random_number.h>
45 #include <parallel/timing.h>
46
47 namespace __gnu_parallel
48 {
49 /** @brief Type to hold the index of a bin.
50 *
51 * Since many variables of this type are allocated, it should be
52 * chosen as small as possible.
53 */
54 typedef unsigned short bin_index;
55
56 /** @brief Data known to every thread participating in
57 __gnu_parallel::parallel_random_shuffle(). */
58 template<typename RandomAccessIterator>
59 struct DRandomShufflingGlobalData
60 {
61 typedef std::iterator_traits<RandomAccessIterator> traits_type;
62 typedef typename traits_type::value_type value_type;
63 typedef typename traits_type::difference_type difference_type;
64
65 /** @brief Begin iterator of the source. */
66 RandomAccessIterator& source;
67
68 /** @brief Temporary arrays for each thread. */
69 value_type** temporaries;
70
71 /** @brief Two-dimensional array to hold the thread-bin distribution.
72 *
73 * Dimensions (num_threads + 1) x (num_bins + 1). */
74 difference_type** dist;
75
76 /** @brief Start indexes of the threads' chunks. */
77 difference_type* starts;
78
79 /** @brief Number of the thread that will further process the
80 corresponding bin. */
81 thread_index_t* bin_proc;
82
83 /** @brief Number of bins to distribute to. */
84 int num_bins;
85
86 /** @brief Number of bits needed to address the bins. */
87 int num_bits;
88
89 /** @brief Constructor. */
90 DRandomShufflingGlobalData(RandomAccessIterator& _source)
91 : source(_source) { }
92 };
93
94 /** @brief Local data for a thread participating in
95 __gnu_parallel::parallel_random_shuffle().
96 */
97 template<typename RandomAccessIterator, typename RandomNumberGenerator>
98 struct DRSSorterPU
99 {
100 /** @brief Number of threads participating in total. */
101 int num_threads;
102
103 /** @brief Number of owning thread. */
104 int iam;
105
106 /** @brief Begin index for bins taken care of by this thread. */
107 bin_index bins_begin;
108
109 /** @brief End index for bins taken care of by this thread. */
110 bin_index bins_end;
111
112 /** @brief Random seed for this thread. */
113 uint32 seed;
114
115 /** @brief Pointer to global data. */
116 DRandomShufflingGlobalData<RandomAccessIterator>* sd;
117 };
118
119 /** @brief Generate a random number in @c [0,2^logp).
120 * @param logp Logarithm (basis 2) of the upper range bound.
121 * @param rng Random number generator to use.
122 */
123 template<typename RandomNumberGenerator>
124 inline int
125 random_number_pow2(int logp, RandomNumberGenerator& rng)
126 { return rng.genrand_bits(logp); }
127
128 /** @brief Random shuffle code executed by each thread.
129 * @param pus Array of thread-local data records. */
130 template<typename RandomAccessIterator, typename RandomNumberGenerator>
131 inline void
132 parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
133 RandomNumberGenerator>* pus)
134 {
135 typedef std::iterator_traits<RandomAccessIterator> traits_type;
136 typedef typename traits_type::value_type value_type;
137 typedef typename traits_type::difference_type difference_type;
138
139 Timing<sequential_tag> t;
140 t.tic();
141
142 DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()];
143 DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
144 thread_index_t iam = d->iam;
145
146 // Indexing: dist[bin][processor]
147 difference_type length = sd->starts[iam + 1] - sd->starts[iam];
148 bin_index* oracles = new bin_index[length];
149 difference_type* dist = new difference_type[sd->num_bins + 1];
150 bin_index* bin_proc = new bin_index[sd->num_bins];
151 value_type** temporaries = new value_type*[d->num_threads];
152
153 // Compute oracles and count appearances.
154 for (bin_index b = 0; b < sd->num_bins + 1; b++)
155 dist[b] = 0;
156 int num_bits = sd->num_bits;
157
158 random_number rng(d->seed);
159
160 // First main loop.
161 for (difference_type i = 0; i < length; i++)
162 {
163 bin_index oracle = random_number_pow2(num_bits, rng);
164 oracles[i] = oracle;
165
166 // To allow prefix (partial) sum.
167 dist[oracle + 1]++;
168 }
169
170 for (bin_index b = 0; b < sd->num_bins + 1; b++)
171 sd->dist[b][iam + 1] = dist[b];
172
173 t.tic();
174
175 #pragma omp barrier
176
177 t.tic();
178
179 #pragma omp single
180 {
181 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
182 // total number of items in bin s
183 for (bin_index s = 0; s < sd->num_bins; s++)
184 __gnu_sequential::partial_sum(sd->dist[s + 1],
185 sd->dist[s + 1] + d->num_threads + 1,
186 sd->dist[s + 1]);
187 }
188
189 #pragma omp barrier
190
191 t.tic();
192
193 sequence_index_t offset = 0, global_offset = 0;
194 for (bin_index s = 0; s < d->bins_begin; s++)
195 global_offset += sd->dist[s + 1][d->num_threads];
196
197 #pragma omp barrier
198
199 for (bin_index s = d->bins_begin; s < d->bins_end; s++)
200 {
201 for (int t = 0; t < d->num_threads + 1; t++)
202 sd->dist[s + 1][t] += offset;
203 offset = sd->dist[s + 1][d->num_threads];
204 }
205
206 sd->temporaries[iam] = static_cast<value_type*>(::operator new(sizeof(value_type) * offset));
207
208 t.tic();
209
210 #pragma omp barrier
211
212 t.tic();
213
214 // Draw local copies to avoid false sharing.
215 for (bin_index b = 0; b < sd->num_bins + 1; b++)
216 dist[b] = sd->dist[b][iam];
217 for (bin_index b = 0; b < sd->num_bins; b++)
218 bin_proc[b] = sd->bin_proc[b];
219 for (thread_index_t t = 0; t < d->num_threads; t++)
220 temporaries[t] = sd->temporaries[t];
221
222 RandomAccessIterator source = sd->source;
223 difference_type start = sd->starts[iam];
224
225 // Distribute according to oracles, second main loop.
226 for (difference_type i = 0; i < length; i++)
227 {
228 bin_index target_bin = oracles[i];
229 thread_index_t target_p = bin_proc[target_bin];
230
231 // Last column [d->num_threads] stays unchanged.
232 temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start);
233 }
234
235 delete[] oracles;
236 delete[] dist;
237 delete[] bin_proc;
238 delete[] temporaries;
239
240 t.tic();
241
242 #pragma omp barrier
243
244 t.tic();
245
246 // Shuffle bins internally.
247 for (bin_index b = d->bins_begin; b < d->bins_end; b++)
248 {
249 value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
250 * end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
251 sequential_random_shuffle(begin, end, rng);
252 std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
253 }
254
255 delete[] sd->temporaries[iam];
256
257 t.tic();
258
259 t.print();
260 }
261
262 /** @brief Round up to the next greater power of 2.
263 * @param x Integer to round up */
264 template<typename T>
265 T
266 round_up_to_pow2(T x)
267 {
268 if (x <= 1)
269 return 1;
270 else
271 return (T)1 << (log2(x - 1) + 1);
272 }
273
274 /** @brief Main parallel random shuffle step.
275 * @param begin Begin iterator of sequence.
276 * @param end End iterator of sequence.
277 * @param n Length of sequence.
278 * @param num_threads Number of threads to use.
279 * @param rng Random number generator to use.
280 */
281 template<typename RandomAccessIterator, typename RandomNumberGenerator>
282 inline void
283 parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng)
284 {
285 typedef std::iterator_traits<RandomAccessIterator> traits_type;
286 typedef typename traits_type::value_type value_type;
287 typedef typename traits_type::difference_type difference_type;
288
289 _GLIBCXX_CALL(n)
290
291 if (num_threads > n)
292 num_threads = static_cast<thread_index_t>(n);
293
294 bin_index num_bins, num_bins_cache;
295
296 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
297 // Try the L1 cache first.
298
299 // Must fit into L1.
300 num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
301 num_bins_cache = round_up_to_pow2(num_bins_cache);
302
303 // No more buckets than TLB entries, power of 2
304 // Power of 2 and at least one element per bin, at most the TLB size.
305 num_bins = std::min(n, (difference_type)num_bins_cache);
306
307 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
308 // 2 TLB entries needed per bin.
309 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
310 #endif
311 num_bins = round_up_to_pow2(num_bins);
312
313 if (num_bins < num_bins_cache)
314 {
315 #endif
316 // Now try the L2 cache
317 // Must fit into L2
318 num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
319 num_bins_cache = round_up_to_pow2(num_bins_cache);
320
321 // No more buckets than TLB entries, power of 2.
322 num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
323 // Power of 2 and at least one element per bin, at most the TLB size.
324 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
325 // 2 TLB entries needed per bin.
326 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
327 #endif
328 num_bins = round_up_to_pow2(num_bins);
329 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
330 }
331 #endif
332
333 num_threads = std::min((bin_index)num_threads, (bin_index)num_bins);
334
335 if (num_threads <= 1)
336 return sequential_random_shuffle(begin, end, rng);
337
338 DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
339
340 DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads];
341
342 sd.temporaries = new value_type*[num_threads];
343 //sd.oracles = new bin_index[n];
344 sd.dist = new difference_type*[num_bins + 1];
345 sd.bin_proc = new thread_index_t[num_bins];
346 for (bin_index b = 0; b < num_bins + 1; b++)
347 sd.dist[b] = new difference_type[num_threads + 1];
348 for (bin_index b = 0; b < (num_bins + 1); b++)
349 {
350 sd.dist[0][0] = 0;
351 sd.dist[b][0] = 0;
352 }
353 difference_type* starts = sd.starts = new difference_type[num_threads + 1];
354 int bin_cursor = 0;
355 sd.num_bins = num_bins;
356 sd.num_bits = log2(num_bins);
357
358 difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0;
359 int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads;
360 for (int i = 0; i < num_threads; i++)
361 {
362 starts[i] = start;
363 start += (i < split) ? (chunk_length + 1) : chunk_length;
364 int j = pus[i].bins_begin = bin_cursor;
365
366 // Range of bins for this processor.
367 bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length;
368 pus[i].bins_end = bin_cursor;
369 for (; j < bin_cursor; j++)
370 sd.bin_proc[j] = i;
371 pus[i].num_threads = num_threads;
372 pus[i].iam = i;
373 pus[i].seed = rng(std::numeric_limits<uint32>::max());
374 pus[i].sd = &sd;
375 }
376 starts[num_threads] = start;
377
378 // Now shuffle in parallel.
379 #pragma omp parallel num_threads(num_threads)
380 parallel_random_shuffle_drs_pu(pus);
381
382 delete[] starts;
383 delete[] sd.bin_proc;
384 for (int s = 0; s < (num_bins + 1); s++)
385 delete[] sd.dist[s];
386 delete[] sd.dist;
387 delete[] sd.temporaries;
388
389 delete[] pus;
390 }
391
392 /** @brief Sequential cache-efficient random shuffle.
393 * @param begin Begin iterator of sequence.
394 * @param end End iterator of sequence.
395 * @param rng Random number generator to use.
396 */
397 template<typename RandomAccessIterator, typename RandomNumberGenerator>
398 inline void
399 sequential_random_shuffle(RandomAccessIterator begin,
400 RandomAccessIterator end,
401 RandomNumberGenerator& rng)
402 {
403 typedef std::iterator_traits<RandomAccessIterator> traits_type;
404 typedef typename traits_type::value_type value_type;
405 typedef typename traits_type::difference_type difference_type;
406
407 difference_type n = end - begin;
408
409 bin_index num_bins, num_bins_cache;
410
411 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
412 // Try the L1 cache first, must fit into L1.
413 num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
414 num_bins_cache = round_up_to_pow2(num_bins_cache);
415
416 // No more buckets than TLB entries, power of 2
417 // Power of 2 and at least one element per bin, at most the TLB size
418 num_bins = std::min(n, (difference_type)num_bins_cache);
419 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
420 // 2 TLB entries needed per bin
421 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
422 #endif
423 num_bins = round_up_to_pow2(num_bins);
424
425 if (num_bins < num_bins_cache)
426 {
427 #endif
428 // Now try the L2 cache, must fit into L2.
429 num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
430 num_bins_cache = round_up_to_pow2(num_bins_cache);
431
432 // No more buckets than TLB entries, power of 2
433 // Power of 2 and at least one element per bin, at most the TLB size.
434 num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
435
436 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
437 // 2 TLB entries needed per bin
438 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
439 #endif
440 num_bins = round_up_to_pow2(num_bins);
441 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
442 }
443 #endif
444
445 int num_bits = log2(num_bins);
446
447 if (num_bins > 1)
448 {
449 value_type* target = static_cast<value_type*>(::operator new(sizeof(value_type) * n));
450 bin_index* oracles = new bin_index[n];
451 difference_type* dist0 = new difference_type[num_bins + 1], * dist1 = new difference_type[num_bins + 1];
452
453 for (int b = 0; b < num_bins + 1; b++)
454 dist0[b] = 0;
455
456 Timing<sequential_tag> t;
457 t.tic();
458
459 random_number bitrng(rng(0xFFFFFFFF));
460
461 for (difference_type i = 0; i < n; i++)
462 {
463 bin_index oracle = random_number_pow2(num_bits, bitrng);
464 oracles[i] = oracle;
465
466 // To allow prefix (partial) sum.
467 dist0[oracle + 1]++;
468 }
469
470 t.tic();
471
472 // Sum up bins.
473 __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
474
475 for (int b = 0; b < num_bins + 1; b++)
476 dist1[b] = dist0[b];
477
478 t.tic();
479
480 // Distribute according to oracles.
481 for (difference_type i = 0; i < n; i++)
482 target[(dist0[oracles[i]])++] = *(begin + i);
483
484 for (int b = 0; b < num_bins; b++)
485 {
486 sequential_random_shuffle(target + dist1[b], target + dist1[b + 1],
487 rng);
488 t.tic();
489 }
490 t.print();
491
492 delete[] dist0;
493 delete[] dist1;
494 delete[] oracles;
495 delete[] target;
496 }
497 else
498 __gnu_sequential::random_shuffle(begin, end, rng);
499 }
500
501 /** @brief Parallel random public call.
502 * @param begin Begin iterator of sequence.
503 * @param end End iterator of sequence.
504 * @param rng Random number generator to use.
505 */
506 template<typename RandomAccessIterator, typename RandomNumberGenerator>
507 inline void
508 parallel_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end,
509 RandomNumberGenerator rng = random_number())
510 {
511 typedef std::iterator_traits<RandomAccessIterator> traits_type;
512 typedef typename traits_type::difference_type difference_type;
513 difference_type n = end - begin;
514 parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ;
515 }
516
517 }
518
519 #endif