3 // Copyright (C) 2007 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 2, or (at your option) any later
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this library; see the file COPYING. If not, write to
18 // the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
19 // MA 02111-1307, USA.
21 // As a special exception, you may use this file as part of a free
22 // software library without restriction. Specifically, if other files
23 // instantiate templates or use macros or inline functions from this
24 // file, or you compile this file and link it with other files to
25 // produce an executable, this file does not by itself cause the
26 // resulting executable to be covered by the GNU General Public
27 // License. This exception does not however invalidate any other
28 // reasons why the executable file might be covered by the GNU General
31 /** @file parallel/random_shuffle.h
32 * @brief Parallel implementation of std::random_shuffle().
33 * This file is a GNU parallel extension to the Standard C++ Library.
36 // Written by Johannes Singler.
38 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
39 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
43 #include <parallel/basic_iterator.h>
44 #include <bits/stl_algo.h>
46 #include <parallel/parallel.h>
47 #include <parallel/base.h>
48 #include <parallel/random_number.h>
49 #include <parallel/timing.h>
51 namespace __gnu_parallel
53 /** @brief Type to hold the index of a bin.
55 * Since many variables of this type are allocated, it should be
56 * chosen as small as possible.
58 typedef unsigned short bin_index
;
60 /** @brief Data known to every thread participating in
61 __gnu_parallel::parallel_random_shuffle(). */
62 template<typename RandomAccessIterator
>
63 struct DRandomShufflingGlobalData
65 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
66 typedef typename
traits_type::value_type value_type
;
67 typedef typename
traits_type::difference_type difference_type
;
69 /** @brief Begin iterator of the source. */
70 RandomAccessIterator
& source
;
72 /** @brief Temporary arrays for each thread. */
73 value_type
** temporaries
;
75 /** @brief Two-dimensional array to hold the thread-bin distribution.
77 * Dimensions (num_threads + 1) x (num_bins + 1). */
78 difference_type
** dist
;
80 /** @brief Start indexes of the threads' chunks. */
81 difference_type
* starts
;
83 /** @brief Number of the thread that will further process the
85 thread_index_t
* bin_proc
;
87 /** @brief Number of bins to distribute to. */
90 /** @brief Number of bits needed to address the bins. */
93 /** @brief Constructor. */
94 DRandomShufflingGlobalData(RandomAccessIterator
& _source
)
98 /** @brief Local data for a thread participating in
99 __gnu_parallel::parallel_random_shuffle().
101 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
104 /** @brief Number of threads participating in total. */
107 /** @brief Number of owning thread. */
110 /** @brief Begin index for bins taken care of by this thread. */
111 bin_index bins_begin
;
113 /** @brief End index for bins taken care of by this thread. */
116 /** @brief Random seed for this thread. */
119 /** @brief Pointer to global data. */
120 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
;
123 /** @brief Generate a random number in @c [0,2^logp).
124 * @param logp Logarithm (basis 2) of the upper range bound.
125 * @param rng Random number generator to use.
127 template<typename RandomNumberGenerator
>
128 inline int random_number_pow2(int logp
, RandomNumberGenerator
& rng
)
130 return rng
.genrand_bits(logp
);
133 /** @brief Random shuffle code executed by each thread.
134 * @param pus Array of thread-local data records. */
135 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
136 inline void parallel_random_shuffle_drs_pu(DRSSorterPU
<RandomAccessIterator
, RandomNumberGenerator
>* pus
)
138 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
139 typedef typename
traits_type::value_type value_type
;
140 typedef typename
traits_type::difference_type difference_type
;
142 Timing
<sequential_tag
> t
;
145 DRSSorterPU
<RandomAccessIterator
, RandomNumberGenerator
>* d
= &pus
[omp_get_thread_num()];
146 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
= d
->sd
;
147 thread_index_t iam
= d
->iam
;
149 // Indexing: dist[bin][processor]
150 difference_type length
= sd
->starts
[iam
+ 1] - sd
->starts
[iam
];
151 bin_index
* oracles
= new bin_index
[length
];
152 difference_type
* dist
= new difference_type
[sd
->num_bins
+ 1];
153 bin_index
* bin_proc
= new bin_index
[sd
->num_bins
];
154 value_type
** temporaries
= new value_type
*[d
->num_threads
];
156 // Compute oracles and count appearances.
157 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; b
++)
159 int num_bits
= sd
->num_bits
;
161 random_number
rng(d
->seed
);
164 for (difference_type i
= 0; i
< length
; i
++)
166 bin_index oracle
= random_number_pow2(num_bits
, rng
);
169 // To allow prefix (partial) sum.
173 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; b
++)
174 sd
->dist
[b
][iam
+ 1] = dist
[b
];
184 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
185 // total number of items in bin s
186 for (bin_index s
= 0; s
< sd
->num_bins
; s
++)
187 partial_sum(sd
->dist
[s
+ 1], sd
->dist
[s
+ 1] + d
->num_threads
+ 1, sd
->dist
[s
+ 1]);
194 sequence_index_t offset
= 0, global_offset
= 0;
195 for (bin_index s
= 0; s
< d
->bins_begin
; s
++)
196 global_offset
+= sd
->dist
[s
+ 1][d
->num_threads
];
200 for (bin_index s
= d
->bins_begin
; s
< d
->bins_end
; s
++)
202 for (int t
= 0; t
< d
->num_threads
+ 1; t
++)
203 sd
->dist
[s
+ 1][t
] += offset
;
204 offset
= sd
->dist
[s
+ 1][d
->num_threads
];
207 sd
->temporaries
[iam
] = new value_type
[offset
];
215 // Draw local copies to avoid false sharing.
216 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; b
++)
217 dist
[b
] = sd
->dist
[b
][iam
];
218 for (bin_index b
= 0; b
< sd
->num_bins
; b
++)
219 bin_proc
[b
] = sd
->bin_proc
[b
];
220 for (thread_index_t t
= 0; t
< d
->num_threads
; t
++)
221 temporaries
[t
] = sd
->temporaries
[t
];
223 RandomAccessIterator source
= sd
->source
;
224 difference_type start
= sd
->starts
[iam
];
226 // Distribute according to oracles, second main loop.
227 for (difference_type i
= 0; i
< length
; i
++)
229 bin_index target_bin
= oracles
[i
];
230 thread_index_t target_p
= bin_proc
[target_bin
];
232 // Last column [d->num_threads] stays unchanged.
233 temporaries
[target_p
][dist
[target_bin
+ 1]++] = *(source
+ i
+ start
);
239 delete[] temporaries
;
247 // Shuffle bins internally.
248 for (bin_index b
= d
->bins_begin
; b
< d
->bins_end
; b
++)
250 value_type
* begin
= sd
->temporaries
[iam
] + ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]),
251 * end
= sd
->temporaries
[iam
] + sd
->dist
[b
+ 1][d
->num_threads
];
252 sequential_random_shuffle(begin
, end
, rng
);
253 std::copy(begin
, end
, sd
->source
+ global_offset
+ ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]));
256 delete[] sd
->temporaries
[iam
];
263 /** @brief Round up to the next greater power of 2.
264 * @param x Integer to round up */
266 T
round_up_to_pow2(T x
)
271 return (T
)1 << (log2(x
- 1) + 1);
274 /** @brief Main parallel random shuffle step.
275 * @param begin Begin iterator of sequence.
276 * @param end End iterator of sequence.
277 * @param n Length of sequence.
278 * @param num_threads Number of threads to use.
279 * @param rng Random number generator to use.
281 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
283 parallel_random_shuffle_drs(RandomAccessIterator begin
, RandomAccessIterator end
, typename
std::iterator_traits
<RandomAccessIterator
>::difference_type n
, int num_threads
, RandomNumberGenerator
& rng
)
285 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
286 typedef typename
traits_type::value_type value_type
;
287 typedef typename
traits_type::difference_type difference_type
;
292 num_threads
= static_cast<thread_index_t
>(n
);
294 bin_index num_bins
, num_bins_cache
;
296 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
297 // Try the L1 cache first.
300 num_bins_cache
= std::max((difference_type
)1, (difference_type
)(n
/ (Settings::L1_cache_size_lb
/ sizeof(value_type
))));
301 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
303 // No more buckets than TLB entries, power of 2
304 // Power of 2 and at least one element per bin, at most the TLB size.
305 num_bins
= std::min(n
, (difference_type
)num_bins_cache
);
307 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
308 // 2 TLB entries needed per bin.
309 num_bins
= std::min((difference_type
)Settings::TLB_size
/ 2, num_bins
);
311 num_bins
= round_up_to_pow2(num_bins
);
313 if (num_bins
< num_bins_cache
)
316 // Now try the L2 cache
318 num_bins_cache
= static_cast<bin_index
>(std::max((difference_type
)1, (difference_type
)(n
/ (Settings::L2_cache_size
/ sizeof(value_type
)))));
319 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
321 // No more buckets than TLB entries, power of 2.
322 num_bins
= static_cast<bin_index
>(std::min(n
, (difference_type
)num_bins_cache
));
323 // Power of 2 and at least one element per bin, at most the TLB size.
324 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
325 // 2 TLB entries needed per bin.
326 num_bins
= std::min((difference_type
)Settings::TLB_size
/ 2, num_bins
);
328 num_bins
= round_up_to_pow2(num_bins
);
329 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
333 num_threads
= std::min((bin_index
)num_threads
, (bin_index
)num_bins
);
335 if (num_threads
<= 1)
336 return sequential_random_shuffle(begin
, end
, rng
);
338 DRandomShufflingGlobalData
<RandomAccessIterator
> sd(begin
);
340 DRSSorterPU
<RandomAccessIterator
, random_number
>* pus
= new DRSSorterPU
<RandomAccessIterator
, random_number
>[num_threads
];
342 sd
.temporaries
= new value_type
*[num_threads
];
343 //sd.oracles = new bin_index[n];
344 sd
.dist
= new difference_type
*[num_bins
+ 1];
345 sd
.bin_proc
= new thread_index_t
[num_bins
];
346 for (bin_index b
= 0; b
< num_bins
+ 1; b
++)
347 sd
.dist
[b
] = new difference_type
[num_threads
+ 1];
348 for (bin_index b
= 0; b
< (num_bins
+ 1); b
++)
353 difference_type
* starts
= sd
.starts
= new difference_type
[num_threads
+ 1];
355 sd
.num_bins
= num_bins
;
356 sd
.num_bits
= log2(num_bins
);
358 difference_type chunk_length
= n
/ num_threads
, split
= n
% num_threads
, start
= 0;
359 int bin_chunk_length
= num_bins
/ num_threads
, bin_split
= num_bins
% num_threads
;
360 for (int i
= 0; i
< num_threads
; i
++)
363 start
+= (i
< split
) ? (chunk_length
+ 1) : chunk_length
;
364 int j
= pus
[i
].bins_begin
= bin_cursor
;
366 // Range of bins for this processor.
367 bin_cursor
+= (i
< bin_split
) ? (bin_chunk_length
+ 1) : bin_chunk_length
;
368 pus
[i
].bins_end
= bin_cursor
;
369 for (; j
< bin_cursor
; j
++)
371 pus
[i
].num_threads
= num_threads
;
373 pus
[i
].seed
= rng(std::numeric_limits
<uint32
>::max());
376 starts
[num_threads
] = start
;
378 // Now shuffle in parallel.
379 #pragma omp parallel num_threads(num_threads)
380 parallel_random_shuffle_drs_pu(pus
);
383 delete[] sd
.bin_proc
;
384 for (int s
= 0; s
< (num_bins
+ 1); s
++)
387 delete[] sd
.temporaries
;
392 /** @brief Sequential cache-efficient random shuffle.
393 * @param begin Begin iterator of sequence.
394 * @param end End iterator of sequence.
395 * @param rng Random number generator to use.
397 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
399 sequential_random_shuffle(RandomAccessIterator begin
, RandomAccessIterator end
, RandomNumberGenerator
& rng
)
401 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
402 typedef typename
traits_type::value_type value_type
;
403 typedef typename
traits_type::difference_type difference_type
;
405 difference_type n
= end
- begin
;
407 bin_index num_bins
, num_bins_cache
;
409 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
410 // Try the L1 cache first, must fit into L1.
411 num_bins_cache
= std::max((difference_type
)1, (difference_type
)(n
/ (Settings::L1_cache_size_lb
/ sizeof(value_type
))));
412 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
414 // No more buckets than TLB entries, power of 2
415 // Power of 2 and at least one element per bin, at most the TLB size
416 num_bins
= std::min(n
, (difference_type
)num_bins_cache
);
417 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
418 // 2 TLB entries needed per bin
419 num_bins
= std::min((difference_type
)Settings::TLB_size
/ 2, num_bins
);
421 num_bins
= round_up_to_pow2(num_bins
);
423 if (num_bins
< num_bins_cache
)
426 // Now try the L2 cache, must fit into L2.
427 num_bins_cache
= static_cast<bin_index
>(std::max((difference_type
)1, (difference_type
)(n
/ (Settings::L2_cache_size
/ sizeof(value_type
)))));
428 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
430 // No more buckets than TLB entries, power of 2
431 // Power of 2 and at least one element per bin, at most the TLB size.
432 num_bins
= static_cast<bin_index
>(std::min(n
, (difference_type
)num_bins_cache
));
434 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
435 // 2 TLB entries needed per bin
436 num_bins
= std::min((difference_type
)Settings::TLB_size
/ 2, num_bins
);
438 num_bins
= round_up_to_pow2(num_bins
);
439 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
443 int num_bits
= log2(num_bins
);
447 value_type
* target
= new value_type
[n
];
448 bin_index
* oracles
= new bin_index
[n
];
449 difference_type
* dist0
= new difference_type
[num_bins
+ 1], * dist1
= new difference_type
[num_bins
+ 1];
451 for (int b
= 0; b
< num_bins
+ 1; b
++)
454 Timing
<sequential_tag
> t
;
457 random_number
bitrng(rng(0xFFFFFFFF));
459 for (difference_type i
= 0; i
< n
; i
++)
461 bin_index oracle
= random_number_pow2(num_bits
, bitrng
);
464 // To allow prefix (partial) sum.
471 partial_sum(dist0
, dist0
+ num_bins
+ 1, dist0
);
473 for (int b
= 0; b
< num_bins
+ 1; b
++)
478 // Distribute according to oracles.
479 for (difference_type i
= 0; i
< n
; i
++)
480 target
[(dist0
[oracles
[i
]])++] = *(begin
+ i
);
482 for (int b
= 0; b
< num_bins
; b
++)
484 sequential_random_shuffle(target
+ dist1
[b
], target
+ dist1
[b
+ 1],
496 __gnu_sequential::random_shuffle(begin
, end
, rng
);
499 /** @brief Parallel random public call.
500 * @param begin Begin iterator of sequence.
501 * @param end End iterator of sequence.
502 * @param rng Random number generator to use.
504 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
506 parallel_random_shuffle(RandomAccessIterator begin
, RandomAccessIterator end
, RandomNumberGenerator rng
= random_number())
508 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
509 typedef typename
traits_type::difference_type difference_type
;
510 difference_type n
= end
- begin
;
511 parallel_random_shuffle_drs(begin
, end
, n
, get_max_threads(), rng
) ;