src/include/fst/extensions/ngram/bitmap-index.h - platform/external/openfst - Git at Google


 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 // Copyright 2005-2010 Google, Inc.
 // Author: sorenj@google.com (Jeffrey Sorensen)

 #ifndef FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
 #define FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_

 #include <vector>
 using std::vector;

 #include <fst/compat.h>

 // This class is a bitstring storage class with an index that allows
 // seeking to the Nth set or clear bit in time O(Log(N)) where N is
 // the length of the bit vector.  In addition, it allows counting set or
 // clear bits over ranges in constant time.
 //
 // This is accomplished by maintaining an "secondary" index of limited
 // size in bits that maintains a running count of the number of bits set
 // in each block of bitmap data.  A block is defined as the number of
 // uint64 values that can fit in the secondary index before an overflow
 // occurs.
 //
 // To handle overflows, a "primary" index containing a running count of
 // bits set in each block is created using the type uint64.

 namespace fst {

 class BitmapIndex {
  public:
   static size_t StorageSize(size_t size) {
     return ((size + kStorageBlockMask) >> kStorageLogBitSize);
   }

   BitmapIndex() : bits_(NULL), size_(0) { }

   bool Get(size_t index) const {
     return (bits_[index >> kStorageLogBitSize] &
             (kOne << (index & kStorageBlockMask))) != 0;
   }

   static void Set(uint64* bits, size_t index) {
     bits[index >> kStorageLogBitSize] |= (kOne << (index & kStorageBlockMask));
   }

   static void Clear(uint64* bits, size_t index) {
     bits[index >> kStorageLogBitSize] &= ~(kOne << (index & kStorageBlockMask));
   }

   size_t Bits() const {
     return size_;
   }

   size_t ArraySize() const {
     return StorageSize(size_);
   }

   // Returns the number of one bits in the bitmap
   size_t GetOnesCount() const {
     return primary_index_[primary_index_size() - 1];
   }

   // Returns the number of one bits in positions 0 to limit - 1.
   // REQUIRES: limit <= Bits()
   size_t Rank1(size_t end) const;

   // Returns the number of one bits in the range start to end - 1.
   // REQUIRES: limit <= Bits()
   size_t GetOnesCountInRange(size_t start, size_t end) const {
     return Rank1(end) - Rank1(start);
   }

   // Returns the number of zero bits in positions 0 to limit - 1.
   // REQUIRES: limit <= Bits()
   size_t Rank0(size_t end) const {
     return end - Rank1(end);
   }

   // Returns the number of zero bits in the range start to end - 1.
   // REQUIRES: limit <= Bits()
   size_t GetZeroesCountInRange(size_t start, size_t end) const {
     return end - start - GetOnesCountInRange(start, end);
   }

   // Return true if any bit between begin inclusive and end exclusive
   // is set.  0 <= begin <= end <= Bits() is required.
   //
   bool TestRange(size_t start, size_t end) const {
     return Rank1(end) > Rank1(start);
   }

   // Returns the offset to the nth set bit (zero based)
   // or Bits() if index >= number of ones
   size_t Select1(size_t bit_index) const;

   // Returns the offset to the nth clear bit (zero based)
   // or Bits() if index > number of
   size_t Select0(size_t bit_index) const;

   // Rebuilds from index for the associated Bitmap, should be called
   // whenever changes have been made to the Bitmap or else behavior
   // of the indexed bitmap methods will be undefined.
   void BuildIndex(const uint64 *bits, size_t size);

   // the secondary index accumulates counts until it can possibly overflow
   // this constant computes the number of uint64 units that can fit into
   // units the size of uint16.
   static const uint64 kOne = 1;
   static const uint32 kStorageBitSize = 64;
   static const uint32 kStorageLogBitSize = 6;
   static const uint32 kSecondaryBlockSize = ((1 << 16) - 1)
       >> kStorageLogBitSize;

  private:
   static const uint32 kStorageBlockMask = kStorageBitSize - 1;

   // returns, from the index, the count of ones up to array_index
   size_t get_index_ones_count(size_t array_index) const;

   // because the indexes, both primary and secondary, contain a running
   // count of the population of one bits contained in [0,i), there is
   // no reason to have an element in the zeroth position as this value would
   // necessarily be zero.  (The bits are indexed in a zero based way.)  Thus
   // we don't store the 0th element in either index.  Both of the following
   // functions, if greater than 0, must be decremented by one before retreiving
   // the value from the corresponding array.
   // returns the 1 + the block that contains the bitindex in question
   // the inverted version works the same but looks for zeros using an inverted
   // view of the index
   size_t find_primary_block(size_t bit_index) const;

   size_t find_inverted_primary_block(size_t bit_index) const;

   // similarly, the secondary index (which resets its count to zero at
   // the end of every kSecondaryBlockSize entries) does not store the element
   // at 0.  Note that the rem_bit_index parameter is the number of bits
   // within the secondary block, after the bits accounted for by the primary
   // block have been removed (i.e. the remaining bits)  And, because we
   // reset to zero with each new block, there is no need to store those
   // actual zeros.
   // returns 1 + the secondary block that contains the bitindex in question
   size_t find_secondary_block(size_t block, size_t rem_bit_index) const;

   size_t find_inverted_secondary_block(size_t block, size_t rem_bit_index)
       const;

   // We create a primary index based upon the number of secondary index
   // blocks.  The primary index uses fields wide enough to accomodate any
   // index of the bitarray so cannot overflow
   // The primary index is the actual running
   // count of one bits set for all blocks (and, thus, all uint64s).
   size_t primary_index_size() const {
     return (ArraySize() + kSecondaryBlockSize - 1) / kSecondaryBlockSize;
   }

   const uint64* bits_;
   size_t size_;

   // The primary index contains the running popcount of all blocks
   // which means the nth value contains the popcounts of
   // [0,n*kSecondaryBlockSize], however, the 0th element is omitted.
   vector<uint32> primary_index_;
   // The secondary index contains the running popcount of the associated
   // bitmap.  It is the same length (in units of uint16) as the
   // bitmap's map is in units of uint64s.
   vector<uint16> secondary_index_;
 };

 }  // end namespace fst

 #endif  // FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_

	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	// Copyright 2005-2010 Google, Inc.
	// Author: sorenj@google.com (Jeffrey Sorensen)

	#ifndef FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
	#define FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_

	#include <vector>
	using std::vector;

	#include <fst/compat.h>

	// This class is a bitstring storage class with an index that allows
	// seeking to the Nth set or clear bit in time O(Log(N)) where N is
	// the length of the bit vector. In addition, it allows counting set or
	// clear bits over ranges in constant time.
	//
	// This is accomplished by maintaining an "secondary" index of limited
	// size in bits that maintains a running count of the number of bits set
	// in each block of bitmap data. A block is defined as the number of
	// uint64 values that can fit in the secondary index before an overflow
	// occurs.
	//
	// To handle overflows, a "primary" index containing a running count of
	// bits set in each block is created using the type uint64.

	namespace fst {

	class BitmapIndex {
	public:
	static size_t StorageSize(size_t size) {
	return ((size + kStorageBlockMask) >> kStorageLogBitSize);
	}

	BitmapIndex() : bits_(NULL), size_(0) { }

	bool Get(size_t index) const {
	return (bits_[index >> kStorageLogBitSize] &
	(kOne << (index & kStorageBlockMask))) != 0;
	}

	static void Set(uint64* bits, size_t index) {
	bits[index >> kStorageLogBitSize] \|= (kOne << (index & kStorageBlockMask));
	}

	static void Clear(uint64* bits, size_t index) {
	bits[index >> kStorageLogBitSize] &= ~(kOne << (index & kStorageBlockMask));
	}

	size_t Bits() const {
	return size_;
	}

	size_t ArraySize() const {
	return StorageSize(size_);
	}

	// Returns the number of one bits in the bitmap
	size_t GetOnesCount() const {
	return primary_index_[primary_index_size() - 1];
	}

	// Returns the number of one bits in positions 0 to limit - 1.
	// REQUIRES: limit <= Bits()
	size_t Rank1(size_t end) const;

	// Returns the number of one bits in the range start to end - 1.
	// REQUIRES: limit <= Bits()
	size_t GetOnesCountInRange(size_t start, size_t end) const {
	return Rank1(end) - Rank1(start);
	}

	// Returns the number of zero bits in positions 0 to limit - 1.
	// REQUIRES: limit <= Bits()
	size_t Rank0(size_t end) const {
	return end - Rank1(end);
	}

	// Returns the number of zero bits in the range start to end - 1.
	// REQUIRES: limit <= Bits()
	size_t GetZeroesCountInRange(size_t start, size_t end) const {
	return end - start - GetOnesCountInRange(start, end);
	}

	// Return true if any bit between begin inclusive and end exclusive
	// is set. 0 <= begin <= end <= Bits() is required.
	//
	bool TestRange(size_t start, size_t end) const {
	return Rank1(end) > Rank1(start);
	}

	// Returns the offset to the nth set bit (zero based)
	// or Bits() if index >= number of ones
	size_t Select1(size_t bit_index) const;

	// Returns the offset to the nth clear bit (zero based)
	// or Bits() if index > number of
	size_t Select0(size_t bit_index) const;

	// Rebuilds from index for the associated Bitmap, should be called
	// whenever changes have been made to the Bitmap or else behavior
	// of the indexed bitmap methods will be undefined.
	void BuildIndex(const uint64 *bits, size_t size);

	// the secondary index accumulates counts until it can possibly overflow
	// this constant computes the number of uint64 units that can fit into
	// units the size of uint16.
	static const uint64 kOne = 1;
	static const uint32 kStorageBitSize = 64;
	static const uint32 kStorageLogBitSize = 6;
	static const uint32 kSecondaryBlockSize = ((1 << 16) - 1)
	>> kStorageLogBitSize;

	private:
	static const uint32 kStorageBlockMask = kStorageBitSize - 1;

	// returns, from the index, the count of ones up to array_index
	size_t get_index_ones_count(size_t array_index) const;

	// because the indexes, both primary and secondary, contain a running
	// count of the population of one bits contained in [0,i), there is
	// no reason to have an element in the zeroth position as this value would
	// necessarily be zero. (The bits are indexed in a zero based way.) Thus
	// we don't store the 0th element in either index. Both of the following
	// functions, if greater than 0, must be decremented by one before retreiving
	// the value from the corresponding array.
	// returns the 1 + the block that contains the bitindex in question
	// the inverted version works the same but looks for zeros using an inverted
	// view of the index
	size_t find_primary_block(size_t bit_index) const;

	size_t find_inverted_primary_block(size_t bit_index) const;

	// similarly, the secondary index (which resets its count to zero at
	// the end of every kSecondaryBlockSize entries) does not store the element
	// at 0. Note that the rem_bit_index parameter is the number of bits
	// within the secondary block, after the bits accounted for by the primary
	// block have been removed (i.e. the remaining bits) And, because we
	// reset to zero with each new block, there is no need to store those
	// actual zeros.
	// returns 1 + the secondary block that contains the bitindex in question
	size_t find_secondary_block(size_t block, size_t rem_bit_index) const;

	size_t find_inverted_secondary_block(size_t block, size_t rem_bit_index)
	const;

	// We create a primary index based upon the number of secondary index
	// blocks. The primary index uses fields wide enough to accomodate any
	// index of the bitarray so cannot overflow
	// The primary index is the actual running
	// count of one bits set for all blocks (and, thus, all uint64s).
	size_t primary_index_size() const {
	return (ArraySize() + kSecondaryBlockSize - 1) / kSecondaryBlockSize;
	}

	const uint64* bits_;
	size_t size_;

	// The primary index contains the running popcount of all blocks
	// which means the nth value contains the popcounts of
	// [0,n*kSecondaryBlockSize], however, the 0th element is omitted.
	vector<uint32> primary_index_;
	// The secondary index contains the running popcount of the associated
	// bitmap. It is the same length (in units of uint16) as the
	// bitmap's map is in units of uint64s.
	vector<uint16> secondary_index_;
	};

	} // end namespace fst

	#endif // FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_