/*
* Copyright (c) 2016, 2019, Red Hat, Inc. All rights reserved.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#ifndef SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#define SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#include "gc/shared/taskqueue.hpp"
#include "gc/shared/taskqueue.inline.hpp"
#include "gc/shenandoah/shenandoahPadding.hpp"
#include "runtime/mutex.hpp"
#include "runtime/thread.hpp"
#include "utilities/debug.hpp"
template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
typedef OverflowTaskQueue<E, F, N> taskqueue_t;
BufferedOverflowTaskQueue() : _buf_empty(true) {};
TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)
// Push task t into the queue. Returns true on success.
inline bool push(E t);
// Attempt to pop from the queue. Returns true on success.
inline bool pop(E &t);
inline void clear();
inline bool is_empty() const {
return _buf_empty && taskqueue_t::is_empty();
}
private:
bool _buf_empty;
E _elem;
};
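// A minimal usage sketch (hypothetical code, not part of this header): push() buffers one element
// locally before spilling to the underlying OverflowTaskQueue, so is_empty() holds only when both the
// buffer and the queue are drained (the push/pop bodies live in the companion .inline.hpp).
//
//   BufferedOverflowTaskQueue<oop, mtGC> q;
//   oop task;
//   q.push(obj);               // `obj` is some previously obtained oop
//   while (q.pop(task)) {
//     // process task
//   }
//   assert(q.is_empty(), "drained");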
// ShenandoahMarkTask
//
// Encodes both regular oops, and the array oops plus chunking data for parallel array processing.
// The design goal is to make the regular oop ops very fast, because that would be the prevailing
// case. On the other hand, it should not block parallel array processing from efficiently dividing
// the array work.
//
// The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the
// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the
// most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed
// that the block has the size of 2^pow. This requires pow to have only 5 bits (2^32) to encode
// all possible arrays.
//
// |---------oop---------|-pow-|--chunk---|
// 0                     49    54        64
//
// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
//
// This encoding gives a few interesting benefits:
//
// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
//
// |---------oop---------|00000|0000000000| // no chunk data
//
// This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
// with zero padding. Testing for "chunkedness" is testing for zero with chunk mask.
//
// b) Splitting tasks for divide-and-conquer is possible. Suppose we have chunk <C, P> that covers
// interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
// <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
// <2*C, P-1>, that covers interval [ (2*C - 1)*2^(P-1); 2*C*2^(P-1) )
//
// Observe that the union of these two intervals is:
// [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
//
// ...which is the original interval:
// [ (C-1)*2^P; C*2^P )
//
// c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split
// down in the parallel threads, which alleviates the upfront (serial) splitting costs.
//
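// As a worked instance of the split in (b): chunk <C = 3, P = 4> covers [ 2*16; 3*16 ) = [ 32; 48 );
// splitting yields <5, 3> covering [ 4*8; 5*8 ) = [ 32; 40 ) and <6, 3> covering [ 5*8; 6*8 ) = [ 40; 48 ),
// whose union is the original [ 32; 48 ).
//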
// The current bit widths impose these encoding limits:
// 10 bits for chunk: max 1024 blocks per array
// 5 bits for power: max 2^32-element array
// 49 bits for oop: max 512 TB of addressable space
//
// Stealing bits from oop trims down the addressable space. Stealing too few bits for chunk ID limits
// potential parallelism. Stealing too few bits for pow limits the maximum array size that can be handled.
// In future, these might be rebalanced to favor one degree of freedom against another. For example,
// if/when Arrays 2.0 bring 2^64-sized arrays, we might need to steal another bit for power. We could
// regain some bits if chunks are counted in ObjArrayMarkingStride units.
//
// There is also a fallback version that uses plain fields when we don't have enough space to steal the
// bits from the native pointer. It is useful for debugging the optimized version.
//
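// As a concrete instance of the layout above: a chunked task <chunk = 3, pow = 4> for an object at
// address A (A < 2^49) is encoded in the optimized version as A | (4 << 49) | (3 << 54); the plain
// task for the same object is just A, with the upper 15 bits zero.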
#ifdef _MSC_VER
#pragma warning(push)
// warning C4522: multiple assignment operators specified
#pragma warning( disable:4522 )
#endif
#ifdef _LP64
#define SHENANDOAH_OPTIMIZED_MARKTASK 1
#else
#define SHENANDOAH_OPTIMIZED_MARKTASK 0
#endif
#if SHENANDOAH_OPTIMIZED_MARKTASK
class ShenandoahMarkTask
{
private:
// Everything is encoded into this field...
uintptr_t _obj;
// ...with these:
static const uint8_t chunk_bits = 10;
static const uint8_t pow_bits = 5;
static const uint8_t oop_bits = sizeof(uintptr_t)*8 - chunk_bits - pow_bits;
static const uint8_t oop_shift = 0;
static const uint8_t pow_shift = oop_bits;
static const uint8_t chunk_shift = oop_bits + pow_bits;
static const uintptr_t oop_extract_mask = right_n_bits(oop_bits);
static const uintptr_t chunk_pow_extract_mask = ~right_n_bits(oop_bits);
static const int chunk_range_mask = right_n_bits(chunk_bits);
static const int pow_range_mask = right_n_bits(pow_bits);
inline oop decode_oop(uintptr_t val) const {
STATIC_ASSERT(oop_shift == 0);
return cast_to_oop(val & oop_extract_mask);
}
inline bool decode_not_chunked(uintptr_t val) const {
// No need to shift for a comparison to zero
return (val & chunk_pow_extract_mask) == 0;
}
inline int decode_chunk(uintptr_t val) const {
return (int) ((val >> chunk_shift) & chunk_range_mask);
}
inline int decode_pow(uintptr_t val) const {
return (int) ((val >> pow_shift) & pow_range_mask);
}
inline uintptr_t encode_oop(oop obj) const {
STATIC_ASSERT(oop_shift == 0);
return cast_from_oop<uintptr_t>(obj);
}
inline uintptr_t encode_chunk(int chunk) const {
return ((uintptr_t) chunk) << chunk_shift;
}
inline uintptr_t encode_pow(int pow) const {
return ((uintptr_t) pow) << pow_shift;
}
public:
ShenandoahMarkTask(oop o = NULL) {
uintptr_t enc = encode_oop(o);
assert(decode_oop(enc) == o, "oop encoding should work: " PTR_FORMAT, p2i(o));
assert(decode_not_chunked(enc), "task should not be chunked");
_obj = enc;
}
ShenandoahMarkTask(oop o, int chunk, int pow) {
uintptr_t enc_oop = encode_oop(o);
uintptr_t enc_chunk = encode_chunk(chunk);
uintptr_t enc_pow = encode_pow(pow);
uintptr_t enc = enc_oop | enc_chunk | enc_pow;
assert(decode_oop(enc) == o, "oop encoding should work: " PTR_FORMAT, p2i(o));
assert(decode_chunk(enc) == chunk, "chunk encoding should work: %d", chunk);
assert(decode_pow(enc) == pow, "pow encoding should work: %d", pow);
assert(!decode_not_chunked(enc), "task should be chunked");
_obj = enc;
}
ShenandoahMarkTask(const ShenandoahMarkTask& t): _obj(t._obj) { }
ShenandoahMarkTask& operator =(const ShenandoahMarkTask& t) {
_obj = t._obj;
return *this;
}
volatile ShenandoahMarkTask&
operator =(const volatile ShenandoahMarkTask& t) volatile {
(void) const_cast<uintptr_t &>(_obj = t._obj);
return *this;
}
public:
inline oop obj() const { return decode_oop(_obj); }
inline int chunk() const { return decode_chunk(_obj); }
inline int pow() const { return decode_pow(_obj); }
inline bool is_not_chunked() const { return decode_not_chunked(_obj); }
DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.
static uintptr_t max_addressable() {
return nth_bit(oop_bits);
}
static int chunk_size() {
return nth_bit(chunk_bits);
}
};
#else
class ShenandoahMarkTask
{
private:
static const uint8_t chunk_bits = 10;
static const uint8_t pow_bits = 5;
static const int chunk_max = nth_bit(chunk_bits) - 1;
static const int pow_max = nth_bit(pow_bits) - 1;
oop _obj;
int _chunk;
int _pow;
public:
ShenandoahMarkTask(oop o = NULL, int chunk = 0, int pow = 0):
_obj(o), _chunk(chunk), _pow(pow) {
assert(0 <= chunk && chunk <= chunk_max, "chunk is in range: %d", chunk);
assert(0 <= pow && pow <= pow_max, "pow is in range: %d", pow);
}
ShenandoahMarkTask(const ShenandoahMarkTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }
ShenandoahMarkTask& operator =(const ShenandoahMarkTask& t) {
_obj = t._obj;
_chunk = t._chunk;
_pow = t._pow;
return *this;
}
volatile ShenandoahMarkTask&
operator =(const volatile ShenandoahMarkTask& t) volatile {
(void)const_cast<oop&>(_obj = t._obj);
_chunk = t._chunk;
_pow = t._pow;
return *this;
}
inline oop obj() const { return _obj; }
inline int chunk() const { return _chunk; }
inline int pow() const { return _pow; }
inline bool is_not_chunked() const { return _chunk == 0; }
DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.
static size_t max_addressable() {
return sizeof(oop);
}
static int chunk_size() {
return nth_bit(chunk_bits);
}
};
#endif // SHENANDOAH_OPTIMIZED_MARKTASK
#ifdef _MSC_VER
#pragma warning(pop)
#endif
typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;
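// A sketch of how a marking loop might consume these tasks (hypothetical worker code, with `q` being a
// ShenandoahObjToScanQueue*): regular tasks are scanned directly, while chunked tasks name the array
// slice [ (chunk-1)*2^pow; chunk*2^pow ) and can be re-split via the (oop, chunk, pow) constructor,
// as in item (b) above. A real scanner would stop splitting at a larger threshold; this is simplified.
//
//   ShenandoahMarkTask task;
//   while (q->pop(task)) {
//     if (task.is_not_chunked()) {
//       // scan all fields of task.obj()
//     } else {
//       int c = task.chunk(), p = task.pow();
//       if (p > 0) {   // chunk still large: split as in (b) and defer both halves
//         q->push(ShenandoahMarkTask(task.obj(), 2*c - 1, p - 1));
//         q->push(ShenandoahMarkTask(task.obj(), 2*c,     p - 1));
//       } else {
//         // scan array elements in [ (c-1) << p; c << p )
//       }
//     }
//   }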
template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
shenandoah_padding(0);
volatile jint _claimed_index;
shenandoah_padding(1);
debug_only(uint _reserved; )
public:
using GenericTaskQueueSet<T, F>::size;
public:
ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
debug_only(_reserved = 0; )
}
void clear_claimed() { _claimed_index = 0; }
T* claim_next();
// Reserve queues that are not subject to parallel claiming
void reserve(uint n) {
assert(n <= size(), "Sanity");
_claimed_index = (jint)n;
debug_only(_reserved = n;)
}
debug_only(uint get_reserved() const { return (uint)_reserved; })
};
template <class T, MEMFLAGS F>
T* ParallelClaimableQueueSet<T, F>::claim_next() {
jint size = (jint)GenericTaskQueueSet<T, F>::size();
if (_claimed_index >= size) {
return NULL;
}
jint index = Atomic::add(1, &_claimed_index);
if (index <= size) {
return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
} else {
return NULL;
}
}
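// A claiming sketch (hypothetical worker code, `qset` being a queue set of this type): workers call
// claim_next() until it returns NULL; clear_claimed() re-arms the set for another round, and reserve(n)
// keeps the first n queues out of parallel claiming.
//
//   qset->clear_claimed();
//   // ... then, in each worker thread:
//   while (T* q = qset->claim_next()) {
//     // drain q
//   }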
class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
public:
ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}
bool is_empty();
void clear();
#if TASKQUEUE_STATS
static void print_taskqueue_stats_hdr(outputStream* const st);
void print_taskqueue_stats() const;
void reset_taskqueue_stats();
#endif // TASKQUEUE_STATS
};
class ShenandoahHeap;
class ShenandoahTerminatorTerminator : public TerminatorTerminator {
private:
ShenandoahHeap* const _heap;
public:
ShenandoahTerminatorTerminator(ShenandoahHeap* const heap) : _heap(heap) { }
virtual bool should_exit_termination();
};
/*
* This is an enhanced implementation of Google's work stealing
* protocol, which is described in the paper:
* Understanding and improving JVM GC work stealing at the data center scale
* (http://dl.acm.org/citation.cfm?id=2926706)
*
* Instead of a dedicated spin-master, our implementation lets the spin-master relinquish the role
* before it goes to sleep/wait, which allows a newly arrived thread to compete for the role.
* The intention of this enhancement is to reduce the spin-master's latency in detecting new tasks
* available for stealing and in detecting the termination condition.
*/
class ShenandoahTaskTerminator: public ParallelTaskTerminator {
private:
Monitor* _blocker;
Thread* _spin_master;
public:
ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set) :
ParallelTaskTerminator(n_threads, queue_set), _spin_master(NULL) {
_blocker = new Monitor(Mutex::leaf, "ShenandoahTaskTerminator", false, Monitor::_safepoint_check_never);
}
~ShenandoahTaskTerminator() {
assert(_blocker != NULL, "Can not be NULL");
delete _blocker;
}
bool offer_termination(ShenandoahTerminatorTerminator* terminator);
bool offer_termination() { return offer_termination((ShenandoahTerminatorTerminator*)NULL); }
private:
bool offer_termination(TerminatorTerminator* terminator) {
ShouldNotReachHere();
return false;
}
private:
size_t tasks_in_queue_set() { return _queue_set->tasks(); }
/*
* Perform the spin-master task.
* Returns true if the termination condition is detected;
* otherwise, returns false.
*/
bool do_spin_master_work(ShenandoahTerminatorTerminator* terminator);
};
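// A termination sketch (hypothetical worker loop, not code from this file), with `terminator` being a
// ShenandoahTaskTerminator* and `tt` a ShenandoahTerminatorTerminator*: a worker that finds no more
// local or stealable work offers termination; if the offer is declined, it resumes draining queues.
//
//   while (true) {
//     // drain own queue, then attempt to steal from other queues
//     if (terminator->offer_termination(tt)) {
//       break;   // all workers have agreed to terminate
//     }
//   }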
#endif // SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP