CountingSort in C++ parallelized with OpenMP.
1
2 /***************************************************************************
3 * Copyright (C) 2007 by Manuel Holtgrewe *
4 * holtgrewe@ira.uka.de *
5 * Distributed under the Boost Software License, Version 1.0. *
6 * (See accompanying file LICENSE_1_0.txt or copy at *
7 * http://www.boost.org/LICENSE_1_0.txt) *
8 ***************************************************************************/
9
10
11
12
13
14 mcstl::Timing<mcstl::active_tag> timer;
15
16 int benchmark = false;
17
18 /* OpenMP Parallel Counting Sort
19 *
20 * A simple implementation of a parallel counting sort algorithm.
21 *
22 * The bucket filling and final result sorting have been parallelized. Copying
23 * back the sorted elements into the input iterators is done using std::copy
24 * which is already parallelized in the MCSTL.
25 *
26 * See:
27 * - http://de.wikipedia.org/wiki/Countingsort
28 * - http://www.umiacs.umd.edu/research/EXPAR/papers/3548/node8.html
29 *
30 * Measurements on a dual CPU, dual core Intel Xeon with 2.4Ghz yielded no speedup
31 * with more than 2 threads, speed broke down, possibly because of the very simple
32 * algorithm becoming memory bound (GCC 4.2). Results were the same on a 4 core
33 * AMD Opteron system. These results hold for array sizes of up to 2**28 (filling
34 * 2 GB of RAM on a 64bit machine leaving enough room for the buffer and the OS).
35 *
36 * See ExperimentalResults.txt.
37 *
38 * Parameters
39 *
40 * Sorts elements [begin, end) "in place". The elements must be in the range of
41 * [0, max_key].
42 */
43 template <typename RandomIterator, typename HashFunctor>
44 void counting_sort(RandomIterator *begin, RandomIterator *end, int max_key, HashFunctor hash)
45 {
46 if (benchmark) timer.tic("start");
47
48 assert(max_key >= 0);
49
50 // number of threads and number of elements to use
51 int num_threads = mcstl::HEURISTIC::num_threads;
52 int n = end - begin;
53 assert(n >= 0);
54
55 // *********** STEP 1: COUNTING ***********
56 // result of this step: local rank bases
57
58 // create one counter array for each thread
59 int **thread_counters = new int* [num_threads];
60 for (int i = 0; i < num_threads; i++)
61 thread_counters[i] = new int[max_key + 1];
62 if (benchmark) timer.tic("counting initialization ");
63
64 // count occurences
65 int *split_positions = mcstl::equal_splitting(n, num_threads);
66
67 {
68 int iam = omp_get_thread_num();
69 int *&c = thread_counters[iam];
70
71 // reset counters
72 for (int i = 0; i <= max_key; i++)
73 c[i] = 0;
74
75 // count occurences
76 for (int i = split_positions[iam]; i < split_positions[iam + 1]; i++)
77 {
78 c[hash(begin[i])]++;
79 }
80 }
81 if (benchmark) timer.tic("counting & local prefix sums ");
82
83 // Compute global prefix sums / ranks from local ones. We *could*
84 // make this parallel, too, but there are only num_threads * (max_key + 1)
85 // entries in total.
86 for (int i = 0, sum = 0; i <= max_key; i++)
87 {
88 for (int j = 0; j < num_threads; j++)
89 {
90 int t = thread_counters[j][i];
91 thread_counters[j][i] = sum;
92 sum += t;
93 }
94 }
95 if (benchmark) timer.tic("global ranks (prefix sums) ");
96
97 // *********** STEP 2: SORTING ***********
98
99 int *buffer = new int[n]; // backbuffer, copied back to input later
100
101 // write sorted result to backbuffer
102
103 {
104 int iam = omp_get_thread_num();
105 int *&c = thread_counters[iam];
106
107 for (int i = split_positions[iam]; i < split_positions[iam + 1]; i++)
108 {
109 assert(hash(begin[i]) <= max_key);
110 buffer[c[hash(begin[i])]++] = begin[i];
111 }
112 }
113 if (benchmark) timer.tic("sorting into backbuffer ");
114
115 // write result from buffer back into input
116 std::copy(buffer, buffer + n, begin);
117
118 if (benchmark) timer.tic("copying back into input ");
119
120 // cleanup
121 delete [] buffer;
122
123 for (int i = 0; i < num_threads; i++)
124 delete [] thread_counters[i];
125 delete [] thread_counters;
126
127 delete [] split_positions;
128 if (benchmark) timer.tic("cleanup ");
129 }