3 // Copyright (C) 2007, 2008 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 2, or (at your option) any later
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this library; see the file COPYING. If not, write to
18 // the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
19 // MA 02111-1307, USA.
21 // As a special exception, you may use this file as part of a free
22 // software library without restriction. Specifically, if other files
23 // instantiate templates or use macros or inline functions from this
24 // file, or you compile this file and link it with other files to
25 // produce an executable, this file does not by itself cause the
26 // resulting executable to be covered by the GNU General Public
27 // License. This exception does not however invalidate any other
28 // reasons why the executable file might be covered by the GNU General
31 /** @file parallel/random_shuffle.h
32 * @brief Parallel implementation of std::random_shuffle().
33 * This file is a GNU parallel extension to the Standard C++ Library.
36 // Written by Johannes Singler.
38 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
39 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
42 #include <bits/stl_numeric.h>
43 #include <parallel/parallel.h>
44 #include <parallel/random_number.h>
46 namespace __gnu_parallel
48 /** @brief Type to hold the index of a bin.
50 * Since many variables of this type are allocated, it should be
51 * chosen as small as possible.
53 typedef unsigned short bin_index
;
55 /** @brief Data known to every thread participating in
56 __gnu_parallel::parallel_random_shuffle(). */
57 template<typename RandomAccessIterator
>
58 struct DRandomShufflingGlobalData
60 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
61 typedef typename
traits_type::value_type value_type
;
62 typedef typename
traits_type::difference_type difference_type
;
64 /** @brief Begin iterator of the source. */
65 RandomAccessIterator
& source
;
67 /** @brief Temporary arrays for each thread. */
68 value_type
** temporaries
;
70 /** @brief Two-dimensional array to hold the thread-bin distribution.
72 * Dimensions (num_threads + 1) x (num_bins + 1). */
73 difference_type
** dist
;
75 /** @brief Start indexes of the threads' chunks. */
76 difference_type
* starts
;
78 /** @brief Number of the thread that will further process the
80 thread_index_t
* bin_proc
;
82 /** @brief Number of bins to distribute to. */
85 /** @brief Number of bits needed to address the bins. */
88 /** @brief Constructor. */
89 DRandomShufflingGlobalData(RandomAccessIterator
& _source
)
93 /** @brief Local data for a thread participating in
94 __gnu_parallel::parallel_random_shuffle().
96 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
99 /** @brief Number of threads participating in total. */
102 /** @brief Begin index for bins taken care of by this thread. */
103 bin_index bins_begin
;
105 /** @brief End index for bins taken care of by this thread. */
108 /** @brief Random seed for this thread. */
111 /** @brief Pointer to global data. */
112 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
;
115 /** @brief Generate a random number in @c [0,2^logp).
116 * @param logp Logarithm (basis 2) of the upper range bound.
117 * @param rng Random number generator to use.
119 template<typename RandomNumberGenerator
>
121 random_number_pow2(int logp
, RandomNumberGenerator
& rng
)
122 { return rng
.genrand_bits(logp
); }
124 /** @brief Random shuffle code executed by each thread.
125 * @param pus Array of thread-local data records. */
126 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
128 parallel_random_shuffle_drs_pu(DRSSorterPU
<RandomAccessIterator
,
129 RandomNumberGenerator
>* pus
)
131 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
132 typedef typename
traits_type::value_type value_type
;
133 typedef typename
traits_type::difference_type difference_type
;
135 thread_index_t iam
= omp_get_thread_num();
136 DRSSorterPU
<RandomAccessIterator
, RandomNumberGenerator
>* d
= &pus
[iam
];
137 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
= d
->sd
;
139 // Indexing: dist[bin][processor]
140 difference_type length
= sd
->starts
[iam
+ 1] - sd
->starts
[iam
];
141 bin_index
* oracles
= new bin_index
[length
];
142 difference_type
* dist
= new difference_type
[sd
->num_bins
+ 1];
143 bin_index
* bin_proc
= new bin_index
[sd
->num_bins
];
144 value_type
** temporaries
= new value_type
*[d
->num_threads
];
146 // Compute oracles and count appearances.
147 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; ++b
)
149 int num_bits
= sd
->num_bits
;
151 random_number
rng(d
->seed
);
154 for (difference_type i
= 0; i
< length
; ++i
)
156 bin_index oracle
= random_number_pow2(num_bits
, rng
);
159 // To allow prefix (partial) sum.
160 ++(dist
[oracle
+ 1]);
163 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; ++b
)
164 sd
->dist
[b
][iam
+ 1] = dist
[b
];
170 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
171 // total number of items in bin s
172 for (bin_index s
= 0; s
< sd
->num_bins
; ++s
)
173 __gnu_sequential::partial_sum(sd
->dist
[s
+ 1],
174 sd
->dist
[s
+ 1] + d
->num_threads
+ 1,
180 sequence_index_t offset
= 0, global_offset
= 0;
181 for (bin_index s
= 0; s
< d
->bins_begin
; ++s
)
182 global_offset
+= sd
->dist
[s
+ 1][d
->num_threads
];
186 for (bin_index s
= d
->bins_begin
; s
< d
->bins_end
; ++s
)
188 for (int t
= 0; t
< d
->num_threads
+ 1; ++t
)
189 sd
->dist
[s
+ 1][t
] += offset
;
190 offset
= sd
->dist
[s
+ 1][d
->num_threads
];
193 sd
->temporaries
[iam
] = static_cast<value_type
*>(
194 ::operator new(sizeof(value_type
) * offset
));
198 // Draw local copies to avoid false sharing.
199 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; ++b
)
200 dist
[b
] = sd
->dist
[b
][iam
];
201 for (bin_index b
= 0; b
< sd
->num_bins
; ++b
)
202 bin_proc
[b
] = sd
->bin_proc
[b
];
203 for (thread_index_t t
= 0; t
< d
->num_threads
; ++t
)
204 temporaries
[t
] = sd
->temporaries
[t
];
206 RandomAccessIterator source
= sd
->source
;
207 difference_type start
= sd
->starts
[iam
];
209 // Distribute according to oracles, second main loop.
210 for (difference_type i
= 0; i
< length
; ++i
)
212 bin_index target_bin
= oracles
[i
];
213 thread_index_t target_p
= bin_proc
[target_bin
];
215 // Last column [d->num_threads] stays unchanged.
216 ::new(&(temporaries
[target_p
][dist
[target_bin
+ 1]++]))
217 value_type(*(source
+ i
+ start
));
223 delete[] temporaries
;
227 // Shuffle bins internally.
228 for (bin_index b
= d
->bins_begin
; b
< d
->bins_end
; ++b
)
231 sd
->temporaries
[iam
] +
232 ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]),
234 sd
->temporaries
[iam
] + sd
->dist
[b
+ 1][d
->num_threads
];
235 sequential_random_shuffle(begin
, end
, rng
);
236 std::copy(begin
, end
, sd
->source
+ global_offset
+
237 ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]));
240 ::operator delete(sd
->temporaries
[iam
]);
243 /** @brief Round up to the next greater power of 2.
244 * @param x Integer to round up */
247 round_up_to_pow2(T x
)
252 return (T
)1 << (log2(x
- 1) + 1);
255 /** @brief Main parallel random shuffle step.
256 * @param begin Begin iterator of sequence.
257 * @param end End iterator of sequence.
258 * @param n Length of sequence.
259 * @param num_threads Number of threads to use.
260 * @param rng Random number generator to use.
262 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
264 parallel_random_shuffle_drs(RandomAccessIterator begin
,
265 RandomAccessIterator end
,
266 typename
std::iterator_traits
267 <RandomAccessIterator
>::difference_type n
,
268 thread_index_t num_threads
,
269 RandomNumberGenerator
& rng
)
271 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
272 typedef typename
traits_type::value_type value_type
;
273 typedef typename
traits_type::difference_type difference_type
;
278 num_threads
= static_cast<thread_index_t
>(n
);
280 bin_index num_bins
, num_bins_cache
;
282 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
283 // Try the L1 cache first.
286 num_bins_cache
= std::max
<difference_type
>(
287 1, n
/ (Settings::L1_cache_size_lb
/ sizeof(value_type
)));
288 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
290 // No more buckets than TLB entries, power of 2
291 // Power of 2 and at least one element per bin, at most the TLB size.
292 num_bins
= std::min
<difference_type
>(n
, num_bins_cache
);
294 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
295 // 2 TLB entries needed per bin.
296 num_bins
= std::min
<difference_type
>(Settings::TLB_size
/ 2, num_bins
);
298 num_bins
= round_up_to_pow2(num_bins
);
300 if (num_bins
< num_bins_cache
)
303 // Now try the L2 cache
305 num_bins_cache
= static_cast<bin_index
>(std::max
<difference_type
>(
306 1, n
/ (Settings::L2_cache_size
/ sizeof(value_type
))));
307 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
309 // No more buckets than TLB entries, power of 2.
310 num_bins
= static_cast<bin_index
>(
311 std::min(n
, static_cast<difference_type
>(num_bins_cache
)));
312 // Power of 2 and at least one element per bin, at most the TLB size.
313 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
314 // 2 TLB entries needed per bin.
316 static_cast<difference_type
>(Settings::TLB_size
/ 2), num_bins
);
318 num_bins
= round_up_to_pow2(num_bins
);
319 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
323 num_threads
= std::min
<bin_index
>(num_threads
, num_bins
);
325 if (num_threads
<= 1)
326 return sequential_random_shuffle(begin
, end
, rng
);
328 DRandomShufflingGlobalData
<RandomAccessIterator
> sd(begin
);
329 DRSSorterPU
<RandomAccessIterator
, random_number
>* pus
;
330 difference_type
* starts
;
332 # pragma omp parallel num_threads(num_threads)
336 pus
= new DRSSorterPU
<RandomAccessIterator
, random_number
>
339 sd
.temporaries
= new value_type
*[num_threads
];
340 sd
.dist
= new difference_type
*[num_bins
+ 1];
341 sd
.bin_proc
= new thread_index_t
[num_bins
];
342 for (bin_index b
= 0; b
< num_bins
+ 1; ++b
)
343 sd
.dist
[b
] = new difference_type
[num_threads
+ 1];
344 for (bin_index b
= 0; b
< (num_bins
+ 1); ++b
)
349 starts
= sd
.starts
= new difference_type
[num_threads
+ 1];
351 sd
.num_bins
= num_bins
;
352 sd
.num_bits
= log2(num_bins
);
354 difference_type chunk_length
= n
/ num_threads
,
355 split
= n
% num_threads
, start
= 0;
356 difference_type bin_chunk_length
= num_bins
/ num_threads
,
357 bin_split
= num_bins
% num_threads
;
358 for (thread_index_t i
= 0; i
< num_threads
; ++i
)
361 start
+= (i
< split
) ? (chunk_length
+ 1) : chunk_length
;
362 int j
= pus
[i
].bins_begin
= bin_cursor
;
364 // Range of bins for this processor.
365 bin_cursor
+= (i
< bin_split
) ?
366 (bin_chunk_length
+ 1) : bin_chunk_length
;
367 pus
[i
].bins_end
= bin_cursor
;
368 for (; j
< bin_cursor
; ++j
)
370 pus
[i
].num_threads
= num_threads
;
371 pus
[i
].seed
= rng(std::numeric_limits
<uint32_t>::max());
374 starts
[num_threads
] = start
;
376 // Now shuffle in parallel.
377 parallel_random_shuffle_drs_pu(pus
);
381 delete[] sd
.bin_proc
;
382 for (int s
= 0; s
< (num_bins
+ 1); ++s
)
385 delete[] sd
.temporaries
;
390 /** @brief Sequential cache-efficient random shuffle.
391 * @param begin Begin iterator of sequence.
392 * @param end End iterator of sequence.
393 * @param rng Random number generator to use.
395 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
397 sequential_random_shuffle(RandomAccessIterator begin
,
398 RandomAccessIterator end
,
399 RandomNumberGenerator
& rng
)
401 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
402 typedef typename
traits_type::value_type value_type
;
403 typedef typename
traits_type::difference_type difference_type
;
405 difference_type n
= end
- begin
;
407 bin_index num_bins
, num_bins_cache
;
409 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
410 // Try the L1 cache first, must fit into L1.
412 std::max
<difference_type
>
413 (1, n
/ (Settings::L1_cache_size_lb
/ sizeof(value_type
)));
414 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
416 // No more buckets than TLB entries, power of 2
417 // Power of 2 and at least one element per bin, at most the TLB size
418 num_bins
= std::min(n
, (difference_type
)num_bins_cache
);
419 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
420 // 2 TLB entries needed per bin
421 num_bins
= std::min((difference_type
)Settings::TLB_size
/ 2, num_bins
);
423 num_bins
= round_up_to_pow2(num_bins
);
425 if (num_bins
< num_bins_cache
)
428 // Now try the L2 cache, must fit into L2.
430 static_cast<bin_index
>(std::max
<difference_type
>(
431 1, n
/ (Settings::L2_cache_size
/ sizeof(value_type
))));
432 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
434 // No more buckets than TLB entries, power of 2
435 // Power of 2 and at least one element per bin, at most the TLB size.
436 num_bins
= static_cast<bin_index
>
437 (std::min(n
, static_cast<difference_type
>(num_bins_cache
)));
439 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
440 // 2 TLB entries needed per bin
442 std::min
<difference_type
>(Settings::TLB_size
/ 2, num_bins
);
444 num_bins
= round_up_to_pow2(num_bins
);
445 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
449 int num_bits
= log2(num_bins
);
453 value_type
* target
= static_cast<value_type
*>(
454 ::operator new(sizeof(value_type
) * n
));
455 bin_index
* oracles
= new bin_index
[n
];
456 difference_type
* dist0
= new difference_type
[num_bins
+ 1],
457 * dist1
= new difference_type
[num_bins
+ 1];
459 for (int b
= 0; b
< num_bins
+ 1; ++b
)
462 random_number
bitrng(rng(0xFFFFFFFF));
464 for (difference_type i
= 0; i
< n
; ++i
)
466 bin_index oracle
= random_number_pow2(num_bits
, bitrng
);
469 // To allow prefix (partial) sum.
470 ++(dist0
[oracle
+ 1]);
474 __gnu_sequential::partial_sum(dist0
, dist0
+ num_bins
+ 1, dist0
);
476 for (int b
= 0; b
< num_bins
+ 1; ++b
)
479 // Distribute according to oracles.
480 for (difference_type i
= 0; i
< n
; ++i
)
481 ::new(&(target
[(dist0
[oracles
[i
]])++])) value_type(*(begin
+ i
));
483 for (int b
= 0; b
< num_bins
; ++b
)
485 sequential_random_shuffle(target
+ dist1
[b
],
486 target
+ dist1
[b
+ 1],
493 ::operator delete(target
);
496 __gnu_sequential::random_shuffle(begin
, end
, rng
);
499 /** @brief Parallel random public call.
500 * @param begin Begin iterator of sequence.
501 * @param end End iterator of sequence.
502 * @param rng Random number generator to use.
504 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
506 parallel_random_shuffle(RandomAccessIterator begin
,
507 RandomAccessIterator end
,
508 RandomNumberGenerator rng
= random_number())
510 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
511 typedef typename
traits_type::difference_type difference_type
;
512 difference_type n
= end
- begin
;
513 parallel_random_shuffle_drs(begin
, end
, n
, get_max_threads(), rng
) ;