3 // Copyright (C) 2007 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 2, or (at your option) any later
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this library; see the file COPYING. If not, write to
18 // the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
19 // MA 02111-1307, USA.
21 // As a special exception, you may use this file as part of a free
22 // software library without restriction. Specifically, if other files
23 // instantiate templates or use macros or inline functions from this
24 // file, or you compile this file and link it with other files to
25 // produce an executable, this file does not by itself cause the
26 // resulting executable to be covered by the GNU General Public
27 // License. This exception does not however invalidate any other
28 // reasons why the executable file might be covered by the GNU General
31 /** @file parallel/random_shuffle.h
32 * @brief Parallel implementation of std::random_shuffle().
33 * This file is a GNU parallel extension to the Standard C++ Library.
36 // Written by Johannes Singler.
38 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
39 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
42 #include <bits/stl_numeric.h>
43 #include <parallel/parallel.h>
44 #include <parallel/random_number.h>
45 #include <parallel/timing.h>
47 namespace __gnu_parallel
49 /** @brief Type to hold the index of a bin.
51 * Since many variables of this type are allocated, it should be
52 * chosen as small as possible.
54 typedef unsigned short bin_index
;
56 /** @brief Data known to every thread participating in
57 __gnu_parallel::parallel_random_shuffle(). */
58 template<typename RandomAccessIterator
>
59 struct DRandomShufflingGlobalData
61 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
62 typedef typename
traits_type::value_type value_type
;
63 typedef typename
traits_type::difference_type difference_type
;
65 /** @brief Begin iterator of the source. */
66 RandomAccessIterator
& source
;
68 /** @brief Temporary arrays for each thread. */
69 value_type
** temporaries
;
71 /** @brief Two-dimensional array to hold the thread-bin distribution.
73 * Dimensions (num_threads + 1) x (num_bins + 1). */
74 difference_type
** dist
;
76 /** @brief Start indexes of the threads' chunks. */
77 difference_type
* starts
;
79 /** @brief Number of the thread that will further process the
81 thread_index_t
* bin_proc
;
83 /** @brief Number of bins to distribute to. */
86 /** @brief Number of bits needed to address the bins. */
89 /** @brief Constructor. */
90 DRandomShufflingGlobalData(RandomAccessIterator
& _source
)
94 /** @brief Local data for a thread participating in
95 __gnu_parallel::parallel_random_shuffle().
97 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
100 /** @brief Number of threads participating in total. */
103 /** @brief Number of owning thread. */
106 /** @brief Begin index for bins taken care of by this thread. */
107 bin_index bins_begin
;
109 /** @brief End index for bins taken care of by this thread. */
112 /** @brief Random seed for this thread. */
115 /** @brief Pointer to global data. */
116 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
;
119 /** @brief Generate a random number in @c [0,2^logp).
120 * @param logp Logarithm (basis 2) of the upper range bound.
121 * @param rng Random number generator to use.
123 template<typename RandomNumberGenerator
>
125 random_number_pow2(int logp
, RandomNumberGenerator
& rng
)
126 { return rng
.genrand_bits(logp
); }
128 /** @brief Random shuffle code executed by each thread.
129 * @param pus Array of thread-local data records. */
130 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
132 parallel_random_shuffle_drs_pu(DRSSorterPU
<RandomAccessIterator
,
133 RandomNumberGenerator
>* pus
)
135 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
136 typedef typename
traits_type::value_type value_type
;
137 typedef typename
traits_type::difference_type difference_type
;
139 Timing
<sequential_tag
> t
;
142 DRSSorterPU
<RandomAccessIterator
, RandomNumberGenerator
>* d
= &pus
[omp_get_thread_num()];
143 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
= d
->sd
;
144 thread_index_t iam
= d
->iam
;
146 // Indexing: dist[bin][processor]
147 difference_type length
= sd
->starts
[iam
+ 1] - sd
->starts
[iam
];
148 bin_index
* oracles
= new bin_index
[length
];
149 difference_type
* dist
= new difference_type
[sd
->num_bins
+ 1];
150 bin_index
* bin_proc
= new bin_index
[sd
->num_bins
];
151 value_type
** temporaries
= new value_type
*[d
->num_threads
];
153 // Compute oracles and count appearances.
154 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; b
++)
156 int num_bits
= sd
->num_bits
;
158 random_number
rng(d
->seed
);
161 for (difference_type i
= 0; i
< length
; i
++)
163 bin_index oracle
= random_number_pow2(num_bits
, rng
);
166 // To allow prefix (partial) sum.
170 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; b
++)
171 sd
->dist
[b
][iam
+ 1] = dist
[b
];
181 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
182 // total number of items in bin s
183 for (bin_index s
= 0; s
< sd
->num_bins
; s
++)
184 __gnu_sequential::partial_sum(sd
->dist
[s
+ 1],
185 sd
->dist
[s
+ 1] + d
->num_threads
+ 1,
193 sequence_index_t offset
= 0, global_offset
= 0;
194 for (bin_index s
= 0; s
< d
->bins_begin
; s
++)
195 global_offset
+= sd
->dist
[s
+ 1][d
->num_threads
];
199 for (bin_index s
= d
->bins_begin
; s
< d
->bins_end
; s
++)
201 for (int t
= 0; t
< d
->num_threads
+ 1; t
++)
202 sd
->dist
[s
+ 1][t
] += offset
;
203 offset
= sd
->dist
[s
+ 1][d
->num_threads
];
206 sd
->temporaries
[iam
] = static_cast<value_type
*>(::operator new(sizeof(value_type
) * offset
));
214 // Draw local copies to avoid false sharing.
215 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; b
++)
216 dist
[b
] = sd
->dist
[b
][iam
];
217 for (bin_index b
= 0; b
< sd
->num_bins
; b
++)
218 bin_proc
[b
] = sd
->bin_proc
[b
];
219 for (thread_index_t t
= 0; t
< d
->num_threads
; t
++)
220 temporaries
[t
] = sd
->temporaries
[t
];
222 RandomAccessIterator source
= sd
->source
;
223 difference_type start
= sd
->starts
[iam
];
225 // Distribute according to oracles, second main loop.
226 for (difference_type i
= 0; i
< length
; i
++)
228 bin_index target_bin
= oracles
[i
];
229 thread_index_t target_p
= bin_proc
[target_bin
];
231 // Last column [d->num_threads] stays unchanged.
232 temporaries
[target_p
][dist
[target_bin
+ 1]++] = *(source
+ i
+ start
);
238 delete[] temporaries
;
246 // Shuffle bins internally.
247 for (bin_index b
= d
->bins_begin
; b
< d
->bins_end
; b
++)
249 value_type
* begin
= sd
->temporaries
[iam
] + ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]),
250 * end
= sd
->temporaries
[iam
] + sd
->dist
[b
+ 1][d
->num_threads
];
251 sequential_random_shuffle(begin
, end
, rng
);
252 std::copy(begin
, end
, sd
->source
+ global_offset
+ ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]));
255 delete[] sd
->temporaries
[iam
];
262 /** @brief Round up to the next greater power of 2.
263 * @param x Integer to round up */
266 round_up_to_pow2(T x
)
271 return (T
)1 << (log2(x
- 1) + 1);
274 /** @brief Main parallel random shuffle step.
275 * @param begin Begin iterator of sequence.
276 * @param end End iterator of sequence.
277 * @param n Length of sequence.
278 * @param num_threads Number of threads to use.
279 * @param rng Random number generator to use.
281 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
283 parallel_random_shuffle_drs(RandomAccessIterator begin
, RandomAccessIterator end
, typename
std::iterator_traits
<RandomAccessIterator
>::difference_type n
, int num_threads
, RandomNumberGenerator
& rng
)
285 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
286 typedef typename
traits_type::value_type value_type
;
287 typedef typename
traits_type::difference_type difference_type
;
292 num_threads
= static_cast<thread_index_t
>(n
);
294 bin_index num_bins
, num_bins_cache
;
296 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
297 // Try the L1 cache first.
300 num_bins_cache
= std::max((difference_type
)1, (difference_type
)(n
/ (Settings::L1_cache_size_lb
/ sizeof(value_type
))));
301 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
303 // No more buckets than TLB entries, power of 2
304 // Power of 2 and at least one element per bin, at most the TLB size.
305 num_bins
= std::min(n
, (difference_type
)num_bins_cache
);
307 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
308 // 2 TLB entries needed per bin.
309 num_bins
= std::min((difference_type
)Settings::TLB_size
/ 2, num_bins
);
311 num_bins
= round_up_to_pow2(num_bins
);
313 if (num_bins
< num_bins_cache
)
316 // Now try the L2 cache
318 num_bins_cache
= static_cast<bin_index
>(std::max((difference_type
)1, (difference_type
)(n
/ (Settings::L2_cache_size
/ sizeof(value_type
)))));
319 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
321 // No more buckets than TLB entries, power of 2.
322 num_bins
= static_cast<bin_index
>(std::min(n
, (difference_type
)num_bins_cache
));
323 // Power of 2 and at least one element per bin, at most the TLB size.
324 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
325 // 2 TLB entries needed per bin.
326 num_bins
= std::min((difference_type
)Settings::TLB_size
/ 2, num_bins
);
328 num_bins
= round_up_to_pow2(num_bins
);
329 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
333 num_threads
= std::min((bin_index
)num_threads
, (bin_index
)num_bins
);
335 if (num_threads
<= 1)
336 return sequential_random_shuffle(begin
, end
, rng
);
338 DRandomShufflingGlobalData
<RandomAccessIterator
> sd(begin
);
340 DRSSorterPU
<RandomAccessIterator
, random_number
>* pus
= new DRSSorterPU
<RandomAccessIterator
, random_number
>[num_threads
];
342 sd
.temporaries
= new value_type
*[num_threads
];
343 //sd.oracles = new bin_index[n];
344 sd
.dist
= new difference_type
*[num_bins
+ 1];
345 sd
.bin_proc
= new thread_index_t
[num_bins
];
346 for (bin_index b
= 0; b
< num_bins
+ 1; b
++)
347 sd
.dist
[b
] = new difference_type
[num_threads
+ 1];
348 for (bin_index b
= 0; b
< (num_bins
+ 1); b
++)
353 difference_type
* starts
= sd
.starts
= new difference_type
[num_threads
+ 1];
355 sd
.num_bins
= num_bins
;
356 sd
.num_bits
= log2(num_bins
);
358 difference_type chunk_length
= n
/ num_threads
, split
= n
% num_threads
, start
= 0;
359 int bin_chunk_length
= num_bins
/ num_threads
, bin_split
= num_bins
% num_threads
;
360 for (int i
= 0; i
< num_threads
; i
++)
363 start
+= (i
< split
) ? (chunk_length
+ 1) : chunk_length
;
364 int j
= pus
[i
].bins_begin
= bin_cursor
;
366 // Range of bins for this processor.
367 bin_cursor
+= (i
< bin_split
) ? (bin_chunk_length
+ 1) : bin_chunk_length
;
368 pus
[i
].bins_end
= bin_cursor
;
369 for (; j
< bin_cursor
; j
++)
371 pus
[i
].num_threads
= num_threads
;
373 pus
[i
].seed
= rng(std::numeric_limits
<uint32
>::max());
376 starts
[num_threads
] = start
;
378 // Now shuffle in parallel.
379 #pragma omp parallel num_threads(num_threads)
380 parallel_random_shuffle_drs_pu(pus
);
383 delete[] sd
.bin_proc
;
384 for (int s
= 0; s
< (num_bins
+ 1); s
++)
387 delete[] sd
.temporaries
;
392 /** @brief Sequential cache-efficient random shuffle.
393 * @param begin Begin iterator of sequence.
394 * @param end End iterator of sequence.
395 * @param rng Random number generator to use.
397 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
399 sequential_random_shuffle(RandomAccessIterator begin
,
400 RandomAccessIterator end
,
401 RandomNumberGenerator
& rng
)
403 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
404 typedef typename
traits_type::value_type value_type
;
405 typedef typename
traits_type::difference_type difference_type
;
407 difference_type n
= end
- begin
;
409 bin_index num_bins
, num_bins_cache
;
411 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
412 // Try the L1 cache first, must fit into L1.
413 num_bins_cache
= std::max((difference_type
)1, (difference_type
)(n
/ (Settings::L1_cache_size_lb
/ sizeof(value_type
))));
414 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
416 // No more buckets than TLB entries, power of 2
417 // Power of 2 and at least one element per bin, at most the TLB size
418 num_bins
= std::min(n
, (difference_type
)num_bins_cache
);
419 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
420 // 2 TLB entries needed per bin
421 num_bins
= std::min((difference_type
)Settings::TLB_size
/ 2, num_bins
);
423 num_bins
= round_up_to_pow2(num_bins
);
425 if (num_bins
< num_bins_cache
)
428 // Now try the L2 cache, must fit into L2.
429 num_bins_cache
= static_cast<bin_index
>(std::max((difference_type
)1, (difference_type
)(n
/ (Settings::L2_cache_size
/ sizeof(value_type
)))));
430 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
432 // No more buckets than TLB entries, power of 2
433 // Power of 2 and at least one element per bin, at most the TLB size.
434 num_bins
= static_cast<bin_index
>(std::min(n
, (difference_type
)num_bins_cache
));
436 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
437 // 2 TLB entries needed per bin
438 num_bins
= std::min((difference_type
)Settings::TLB_size
/ 2, num_bins
);
440 num_bins
= round_up_to_pow2(num_bins
);
441 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
445 int num_bits
= log2(num_bins
);
449 value_type
* target
= static_cast<value_type
*>(::operator new(sizeof(value_type
) * n
));
450 bin_index
* oracles
= new bin_index
[n
];
451 difference_type
* dist0
= new difference_type
[num_bins
+ 1], * dist1
= new difference_type
[num_bins
+ 1];
453 for (int b
= 0; b
< num_bins
+ 1; b
++)
456 Timing
<sequential_tag
> t
;
459 random_number
bitrng(rng(0xFFFFFFFF));
461 for (difference_type i
= 0; i
< n
; i
++)
463 bin_index oracle
= random_number_pow2(num_bits
, bitrng
);
466 // To allow prefix (partial) sum.
473 __gnu_sequential::partial_sum(dist0
, dist0
+ num_bins
+ 1, dist0
);
475 for (int b
= 0; b
< num_bins
+ 1; b
++)
480 // Distribute according to oracles.
481 for (difference_type i
= 0; i
< n
; i
++)
482 target
[(dist0
[oracles
[i
]])++] = *(begin
+ i
);
484 for (int b
= 0; b
< num_bins
; b
++)
486 sequential_random_shuffle(target
+ dist1
[b
], target
+ dist1
[b
+ 1],
498 __gnu_sequential::random_shuffle(begin
, end
, rng
);
501 /** @brief Parallel random public call.
502 * @param begin Begin iterator of sequence.
503 * @param end End iterator of sequence.
504 * @param rng Random number generator to use.
506 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
508 parallel_random_shuffle(RandomAccessIterator begin
, RandomAccessIterator end
,
509 RandomNumberGenerator rng
= random_number())
511 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
512 typedef typename
traits_type::difference_type difference_type
;
513 difference_type n
= end
- begin
;
514 parallel_random_shuffle_drs(begin
, end
, n
, get_max_threads(), rng
) ;