Never been to DZone Snippets before?

Snippets is a public source code repository. Easily build up your personal collection of code snippets, categorize them with tags / keywords, and share them with the world

About this user

http://ttt.ggnore.net

« Newer Snippets
Older Snippets »
Showing 1-1 of 1 total  RSS 

A CountingSort parallelized using OpenMP.

CountingSort in C++ parallelized with OpenMP.

   1  
   2  /***************************************************************************
   3   *   Copyright (C) 2007 by Manuel Holtgrewe                                *
   4   *   holtgrewe@ira.uka.de                                                  *
   5   *   Distributed under the Boost Software License, Version 1.0.            *
   6   *   (See accompanying file LICENSE_1_0.txt or copy at                     *
   7   *   http://www.boost.org/LICENSE_1_0.txt)                                 *
   8   ***************************************************************************/
   9  
  10  #include <algorithm>
  11  #include <meta/mcstl_timing.h>
  12  #include <omp.h>
  13  
  14  mcstl::Timing<mcstl::active_tag> timer;
  15  
  16  int benchmark = false;
  17  
  18  /* OpenMP Parallel Counting Sort
  19   *
  20   * A simple implementation of a parallel counting sort algorithm.
  21   *
  22   * The bucket filling and final result sorting have been parallelized. Copying
  23   * back the sorted elements into the input iterators is done using std::copy
  24   * which is already parallelized in the MCSTL.
  25   *
  26   * See:
  27   *  - http://de.wikipedia.org/wiki/Countingsort
  28   *  - http://www.umiacs.umd.edu/research/EXPAR/papers/3548/node8.html
  29   *
  30   * Measurements on a dual CPU, dual core Intel Xeon with 2.4Ghz yielded no speedup
  31   * with more than 2 threads, speed broke down, possibly because of the very simple
  32   * algorithm becoming memory bound (GCC 4.2). Results were the same on a 4 core
  33   * AMD Opteron system. These results hold for array sizes of up to 2**28 (filling
  34   * 2 GB of RAM on a 64bit machine leaving enough room for the buffer and the OS).
  35   *
  36   * See ExperimentalResults.txt.
  37   *
  38   * Parameters
  39   *
  40   * Sorts elements [begin, end) "in place". The elements must be in the range of
  41   * [0, max_key].
  42   */
  43  template <typename RandomIterator, typename HashFunctor>
  44  void counting_sort(RandomIterator *begin, RandomIterator *end, int max_key, HashFunctor hash)
  45  {
  46  	if (benchmark) timer.tic("start");
  47  	
  48  	assert(max_key >= 0);
  49  	
  50  	// number of threads and number of elements to use
  51  	int num_threads = mcstl::HEURISTIC::num_threads;
  52  	int n = end - begin;
  53  	assert(n >= 0);
  54  	
  55  	// *********** STEP 1: COUNTING ***********
  56  	// result of this step: local rank bases
  57  	
  58  	// create one counter array for each thread
  59  	int **thread_counters = new int* [num_threads];
  60  	for (int i = 0; i < num_threads; i++)
  61  		thread_counters[i] = new int[max_key + 1];
  62  	if (benchmark) timer.tic("counting initialization           ");
  63  	
  64  	// count occurences
  65  	int *split_positions = mcstl::equal_splitting(n, num_threads);
  66  	#pragma omp parallel num_threads(num_threads)
  67  	{
  68  		int iam = omp_get_thread_num();
  69  		int *&c = thread_counters[iam];
  70  		
  71  		// reset counters
  72  		for (int i = 0; i <= max_key; i++)
  73  			c[i] = 0;
  74  		
  75  		// count occurences
  76  		for (int i = split_positions[iam]; i < split_positions[iam + 1]; i++)
  77  		{
  78  			c[hash(begin[i])]++;
  79  		}
  80  	}
  81  	if (benchmark) timer.tic("counting & local prefix sums      ");
  82  	
  83  	// Compute global prefix sums / ranks from local ones. We *could*
  84  	// make this parallel, too, but there are only num_threads * (max_key + 1)
  85  	// entries in total.
  86  	for (int i = 0, sum = 0; i <= max_key; i++)
  87  		{
  88  			for (int j = 0; j < num_threads; j++)
  89  				{
  90  					int t = thread_counters[j][i];
  91  					thread_counters[j][i] = sum;
  92  					sum += t;
  93  				}
  94  		}
  95  	if (benchmark) timer.tic("global ranks (prefix sums)        ");
  96  	
  97  	// *********** STEP 2: SORTING ***********
  98  	
  99  	int *buffer = new int[n]; // backbuffer, copied back to input later
 100  	
 101  	// write sorted result to backbuffer
 102  	#pragma omp parallel num_threads(num_threads)
 103  	{
 104  		int iam = omp_get_thread_num();
 105  		int *&c = thread_counters[iam];
 106  		
 107  		for (int i = split_positions[iam]; i < split_positions[iam + 1]; i++)
 108  			{
 109  				assert(hash(begin[i]) <= max_key);
 110  				buffer[c[hash(begin[i])]++] = begin[i];
 111  			}
 112  	}	
 113  	if (benchmark) timer.tic("sorting into backbuffer           ");
 114  	
 115  	// write result from buffer back into input
 116  	std::copy(buffer, buffer + n, begin);
 117  	
 118  	if (benchmark) timer.tic("copying back into input           ");
 119  	
 120  	// cleanup
 121  	delete [] buffer;
 122  
 123  	for (int i = 0; i < num_threads; i++)
 124  		delete [] thread_counters[i];
 125  	delete [] thread_counters;
 126  
 127  	delete [] split_positions;
 128  	if (benchmark) timer.tic("cleanup                           ");
 129  }
« Newer Snippets
Older Snippets »
Showing 1-1 of 1 total  RSS