| // -*- C++ -*- |
| |
| // Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. |
| // |
| // This file is part of the GNU ISO C++ Library. This library is free |
| // software; you can redistribute it and/or modify it under the terms |
| // of the GNU General Public License as published by the Free Software |
| // Foundation; either version 3, or (at your option) any later |
| // version. |
| |
| // This library is distributed in the hope that it will be useful, but |
| // WITHOUT ANY WARRANTY; without even the implied warranty of |
| // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| // General Public License for more details. |
| |
| // Under Section 7 of GPL version 3, you are granted additional |
| // permissions described in the GCC Runtime Library Exception, version |
| // 3.1, as published by the Free Software Foundation. |
| |
| // You should have received a copy of the GNU General Public License and |
| // a copy of the GCC Runtime Library Exception along with this program; |
| // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see |
| // <http://www.gnu.org/licenses/>. |
| |
| /** @file parallel/balanced_quicksort.h |
| * @brief Implementation of a dynamically load-balanced parallel quicksort. |
| * |
| * It works in-place and needs only logarithmic extra memory. |
| * The algorithm is similar to the one proposed in |
| * |
| * P. Tsigas and Y. Zhang. |
| * A simple, fast parallel implementation of quicksort and |
| * its performance evaluation on SUN enterprise 10000. |
| * In 11th Euromicro Conference on Parallel, Distributed and |
| * Network-Based Processing, page 372, 2003. |
| * |
| * This file is a GNU parallel extension to the Standard C++ Library. |
| */ |
| |
| // Written by Johannes Singler. |
| |
| #ifndef _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H |
| #define _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H 1 |
| |
| #include <parallel/basic_iterator.h> |
| #include <bits/stl_algo.h> |
| |
| #include <parallel/settings.h> |
| #include <parallel/partition.h> |
| #include <parallel/random_number.h> |
| #include <parallel/queue.h> |
| #include <functional> |
| |
| #if _GLIBCXX_ASSERTIONS |
| #include <parallel/checkers.h> |
| #endif |
| |
| namespace __gnu_parallel |
| { |
| /** @brief Information local to one thread in the parallel quicksort run. */ |
| template<typename RandomAccessIterator> |
| struct QSBThreadLocal |
| { |
| typedef std::iterator_traits<RandomAccessIterator> traits_type; |
| typedef typename traits_type::difference_type difference_type; |
| |
| /** @brief Continuous part of the sequence, described by an |
| iterator pair. */ |
| typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece; |
| |
| /** @brief Initial piece to work on. */ |
| Piece initial; |
| |
| /** @brief Work-stealing queue. */ |
| RestrictedBoundedConcurrentQueue<Piece> leftover_parts; |
| |
| /** @brief Number of threads involved in this algorithm. */ |
| thread_index_t num_threads; |
| |
| /** @brief Pointer to a counter of elements left over to sort. */ |
| volatile difference_type* elements_leftover; |
| |
| /** @brief The complete sequence to sort. */ |
| Piece global; |
| |
| /** @brief Constructor. |
| * @param queue_size Size of the work-stealing queue. */ |
| QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { } |
| }; |
| |
| /** @brief Balanced quicksort divide step. |
| * @param begin Begin iterator of subsequence. |
| * @param end End iterator of subsequence. |
| * @param comp Comparator. |
| * @param num_threads Number of threads that are allowed to work on |
| * this part. |
| * @pre @c (end-begin)>=1 */ |
| template<typename RandomAccessIterator, typename Comparator> |
| typename std::iterator_traits<RandomAccessIterator>::difference_type |
| qsb_divide(RandomAccessIterator begin, RandomAccessIterator end, |
| Comparator comp, thread_index_t num_threads) |
| { |
| _GLIBCXX_PARALLEL_ASSERT(num_threads > 0); |
| |
| typedef std::iterator_traits<RandomAccessIterator> traits_type; |
| typedef typename traits_type::value_type value_type; |
| typedef typename traits_type::difference_type difference_type; |
| |
| RandomAccessIterator pivot_pos = |
| median_of_three_iterators(begin, begin + (end - begin) / 2, |
| end - 1, comp); |
| |
| #if defined(_GLIBCXX_ASSERTIONS) |
| // Must be in between somewhere. |
| difference_type n = end - begin; |
| |
| _GLIBCXX_PARALLEL_ASSERT( |
| (!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos)) |
| || (!comp(*pivot_pos, *begin) && !comp(*(end - 1), *pivot_pos)) |
| || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos)) |
| || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*(end - 1), *pivot_pos)) |
| || (!comp(*pivot_pos, *(end - 1)) && !comp(*begin, *pivot_pos)) |
| || (!comp(*pivot_pos, *(end - 1)) && !comp(*(begin + n / 2), *pivot_pos))); |
| #endif |
| |
| // Swap pivot value to end. |
| if (pivot_pos != (end - 1)) |
| std::swap(*pivot_pos, *(end - 1)); |
| pivot_pos = end - 1; |
| |
| __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> |
| pred(comp, *pivot_pos); |
| |
| // Divide, returning end - begin - 1 in the worst case. |
| difference_type split_pos = parallel_partition( |
| begin, end - 1, pred, num_threads); |
| |
| // Swap back pivot to middle. |
| std::swap(*(begin + split_pos), *pivot_pos); |
| pivot_pos = begin + split_pos; |
| |
| #if _GLIBCXX_ASSERTIONS |
| RandomAccessIterator r; |
| for (r = begin; r != pivot_pos; ++r) |
| _GLIBCXX_PARALLEL_ASSERT(comp(*r, *pivot_pos)); |
| for (; r != end; ++r) |
| _GLIBCXX_PARALLEL_ASSERT(!comp(*r, *pivot_pos)); |
| #endif |
| |
| return split_pos; |
| } |
| |
| /** @brief Quicksort conquer step. |
| * @param tls Array of thread-local storages. |
| * @param begin Begin iterator of subsequence. |
| * @param end End iterator of subsequence. |
| * @param comp Comparator. |
| * @param iam Number of the thread processing this function. |
| * @param num_threads |
| * Number of threads that are allowed to work on this part. */ |
| template<typename RandomAccessIterator, typename Comparator> |
| void |
| qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls, |
| RandomAccessIterator begin, RandomAccessIterator end, |
| Comparator comp, |
| thread_index_t iam, thread_index_t num_threads, |
| bool parent_wait) |
| { |
| typedef std::iterator_traits<RandomAccessIterator> traits_type; |
| typedef typename traits_type::value_type value_type; |
| typedef typename traits_type::difference_type difference_type; |
| |
| difference_type n = end - begin; |
| |
| if (num_threads <= 1 || n <= 1) |
| { |
| tls[iam]->initial.first = begin; |
| tls[iam]->initial.second = end; |
| |
| qsb_local_sort_with_helping(tls, comp, iam, parent_wait); |
| |
| return; |
| } |
| |
| // Divide step. |
| difference_type split_pos = qsb_divide(begin, end, comp, num_threads); |
| |
| #if _GLIBCXX_ASSERTIONS |
| _GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin)); |
| #endif |
| |
| thread_index_t num_threads_leftside = |
| std::max<thread_index_t>(1, std::min<thread_index_t>( |
| num_threads - 1, split_pos * num_threads / n)); |
| |
| # pragma omp atomic |
| *tls[iam]->elements_leftover -= (difference_type)1; |
| |
| // Conquer step. |
| # pragma omp parallel num_threads(2) |
| { |
| bool wait; |
| if(omp_get_num_threads() < 2) |
| wait = false; |
| else |
| wait = parent_wait; |
| |
| # pragma omp sections |
| { |
| # pragma omp section |
| { |
| qsb_conquer(tls, begin, begin + split_pos, comp, |
| iam, |
| num_threads_leftside, |
| wait); |
| wait = parent_wait; |
| } |
| // The pivot_pos is left in place, to ensure termination. |
| # pragma omp section |
| { |
| qsb_conquer(tls, begin + split_pos + 1, end, comp, |
| iam + num_threads_leftside, |
| num_threads - num_threads_leftside, |
| wait); |
| wait = parent_wait; |
| } |
| } |
| } |
| } |
| |
| /** |
| * @brief Quicksort step doing load-balanced local sort. |
| * @param tls Array of thread-local storages. |
| * @param comp Comparator. |
| * @param iam Number of the thread processing this function. |
| */ |
| template<typename RandomAccessIterator, typename Comparator> |
| void |
| qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls, |
| Comparator& comp, int iam, bool wait) |
| { |
| typedef std::iterator_traits<RandomAccessIterator> traits_type; |
| typedef typename traits_type::value_type value_type; |
| typedef typename traits_type::difference_type difference_type; |
| typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece; |
| |
| QSBThreadLocal<RandomAccessIterator>& tl = *tls[iam]; |
| |
| difference_type base_case_n = |
| _Settings::get().sort_qsb_base_case_maximal_n; |
| if (base_case_n < 2) |
| base_case_n = 2; |
| thread_index_t num_threads = tl.num_threads; |
| |
| // Every thread has its own random number generator. |
| random_number rng(iam + 1); |
| |
| Piece current = tl.initial; |
| |
| difference_type elements_done = 0; |
| #if _GLIBCXX_ASSERTIONS |
| difference_type total_elements_done = 0; |
| #endif |
| |
| for (;;) |
| { |
| // Invariant: current must be a valid (maybe empty) range. |
| RandomAccessIterator begin = current.first, end = current.second; |
| difference_type n = end - begin; |
| |
| if (n > base_case_n) |
| { |
| // Divide. |
| RandomAccessIterator pivot_pos = begin + rng(n); |
| |
| // Swap pivot_pos value to end. |
| if (pivot_pos != (end - 1)) |
| std::swap(*pivot_pos, *(end - 1)); |
| pivot_pos = end - 1; |
| |
| __gnu_parallel::binder2nd |
| <Comparator, value_type, value_type, bool> |
| pred(comp, *pivot_pos); |
| |
| // Divide, leave pivot unchanged in last place. |
| RandomAccessIterator split_pos1, split_pos2; |
| split_pos1 = __gnu_sequential::partition(begin, end - 1, pred); |
| |
| // Left side: < pivot_pos; right side: >= pivot_pos. |
| #if _GLIBCXX_ASSERTIONS |
| _GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end); |
| #endif |
| // Swap pivot back to middle. |
| if (split_pos1 != pivot_pos) |
| std::swap(*split_pos1, *pivot_pos); |
| pivot_pos = split_pos1; |
| |
| // In case all elements are equal, split_pos1 == 0. |
| if ((split_pos1 + 1 - begin) < (n >> 7) |
| || (end - split_pos1) < (n >> 7)) |
| { |
| // Very unequal split, one part smaller than one 128th |
| // elements not strictly larger than the pivot. |
| __gnu_parallel::unary_negate<__gnu_parallel::binder1st |
| <Comparator, value_type, value_type, bool>, value_type> |
| pred(__gnu_parallel::binder1st |
| <Comparator, value_type, value_type, bool>(comp, |
| *pivot_pos)); |
| |
| // Find other end of pivot-equal range. |
| split_pos2 = __gnu_sequential::partition(split_pos1 + 1, |
| end, pred); |
| } |
| else |
| // Only skip the pivot. |
| split_pos2 = split_pos1 + 1; |
| |
| // Elements equal to pivot are done. |
| elements_done += (split_pos2 - split_pos1); |
| #if _GLIBCXX_ASSERTIONS |
| total_elements_done += (split_pos2 - split_pos1); |
| #endif |
| // Always push larger part onto stack. |
| if (((split_pos1 + 1) - begin) < (end - (split_pos2))) |
| { |
| // Right side larger. |
| if ((split_pos2) != end) |
| tl.leftover_parts.push_front(std::make_pair(split_pos2, |
| end)); |
| |
| //current.first = begin; //already set anyway |
| current.second = split_pos1; |
| continue; |
| } |
| else |
| { |
| // Left side larger. |
| if (begin != split_pos1) |
| tl.leftover_parts.push_front(std::make_pair(begin, |
| split_pos1)); |
| |
| current.first = split_pos2; |
| //current.second = end; //already set anyway |
| continue; |
| } |
| } |
| else |
| { |
| __gnu_sequential::sort(begin, end, comp); |
| elements_done += n; |
| #if _GLIBCXX_ASSERTIONS |
| total_elements_done += n; |
| #endif |
| |
| // Prefer own stack, small pieces. |
| if (tl.leftover_parts.pop_front(current)) |
| continue; |
| |
| # pragma omp atomic |
| *tl.elements_leftover -= elements_done; |
| |
| elements_done = 0; |
| |
| #if _GLIBCXX_ASSERTIONS |
| double search_start = omp_get_wtime(); |
| #endif |
| |
| // Look for new work. |
| bool successfully_stolen = false; |
| while (wait && *tl.elements_leftover > 0 && !successfully_stolen |
| #if _GLIBCXX_ASSERTIONS |
| // Possible dead-lock. |
| && (omp_get_wtime() < (search_start + 1.0)) |
| #endif |
| ) |
| { |
| thread_index_t victim; |
| victim = rng(num_threads); |
| |
| // Large pieces. |
| successfully_stolen = (victim != iam) |
| && tls[victim]->leftover_parts.pop_back(current); |
| if (!successfully_stolen) |
| yield(); |
| #if !defined(__ICC) && !defined(__ECC) |
| # pragma omp flush |
| #endif |
| } |
| |
| #if _GLIBCXX_ASSERTIONS |
| if (omp_get_wtime() >= (search_start + 1.0)) |
| { |
| sleep(1); |
| _GLIBCXX_PARALLEL_ASSERT(omp_get_wtime() |
| < (search_start + 1.0)); |
| } |
| #endif |
| if (!successfully_stolen) |
| { |
| #if _GLIBCXX_ASSERTIONS |
| _GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0); |
| #endif |
| return; |
| } |
| } |
| } |
| } |
| |
| /** @brief Top-level quicksort routine. |
| * @param begin Begin iterator of sequence. |
| * @param end End iterator of sequence. |
| * @param comp Comparator. |
| * @param num_threads Number of threads that are allowed to work on |
| * this part. |
| */ |
| template<typename RandomAccessIterator, typename Comparator> |
| void |
| parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end, |
| Comparator comp, |
| thread_index_t num_threads) |
| { |
| _GLIBCXX_CALL(end - begin) |
| |
| typedef std::iterator_traits<RandomAccessIterator> traits_type; |
| typedef typename traits_type::value_type value_type; |
| typedef typename traits_type::difference_type difference_type; |
| typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece; |
| |
| typedef QSBThreadLocal<RandomAccessIterator> tls_type; |
| |
| difference_type n = end - begin; |
| |
| if (n <= 1) |
| return; |
| |
| // At least one element per processor. |
| if (num_threads > n) |
| num_threads = static_cast<thread_index_t>(n); |
| |
| // Initialize thread local storage |
| tls_type** tls = new tls_type*[num_threads]; |
| difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1); |
| for (thread_index_t t = 0; t < num_threads; ++t) |
| tls[t] = new QSBThreadLocal<RandomAccessIterator>(queue_size); |
| |
| // There can never be more than ceil(log2(n)) ranges on the stack, because |
| // 1. Only one processor pushes onto the stack |
| // 2. The largest range has at most length n |
| // 3. Each range is larger than half of the range remaining |
| volatile difference_type elements_leftover = n; |
| for (int i = 0; i < num_threads; ++i) |
| { |
| tls[i]->elements_leftover = &elements_leftover; |
| tls[i]->num_threads = num_threads; |
| tls[i]->global = std::make_pair(begin, end); |
| |
| // Just in case nothing is left to assign. |
| tls[i]->initial = std::make_pair(end, end); |
| } |
| |
| // Main recursion call. |
| qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true); |
| |
| #if _GLIBCXX_ASSERTIONS |
| // All stack must be empty. |
| Piece dummy; |
| for (int i = 1; i < num_threads; ++i) |
| _GLIBCXX_PARALLEL_ASSERT(!tls[i]->leftover_parts.pop_back(dummy)); |
| #endif |
| |
| for (int i = 0; i < num_threads; ++i) |
| delete tls[i]; |
| delete[] tls; |
| } |
| } // namespace __gnu_parallel |
| |
| #endif /* _GLIBCXX_PARALLEL_BALANCED_QUICKSORT_H */ |