| |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| // Copyright 2005-2010 Google, Inc. |
| // Author: Jeffrey Soresnen (sorenj@google.com) |
| |
| #include <fst/extensions/ngram/bitmap-index.h> |
| |
| #include <algorithm> |
| #include <iterator> |
| |
| #include <fst/extensions/ngram/nthbit.h> |
| |
| namespace fst { |
| |
| // These two internal classes implemented inverted views of the |
| // primary and secondary indexes. That is, they provide iterators |
| // that have operator*'s that return the number zeros rather than |
| // the number of ones. |
| |
| class primary_index_inverted : public vector<uint32>::const_iterator { |
| public: |
| primary_index_inverted() {} |
| primary_index_inverted(vector<uint32>::const_iterator loc, |
| vector<uint32>::const_iterator begin) : |
| vector<uint32>::const_iterator(loc), begin_(begin) {} |
| uint32 operator*() { |
| return BitmapIndex::kStorageBitSize * BitmapIndex::kSecondaryBlockSize * |
| (1 + std::distance<vector<uint32>::const_iterator>(begin_, *this)) - |
| vector<uint32>::const_iterator::operator*(); |
| } |
| private: |
| vector<uint32>::const_iterator begin_; |
| }; |
| |
| class secondary_index_inverted : public vector<uint16>::const_iterator { |
| public: |
| secondary_index_inverted() : vector<uint16>::const_iterator() {} |
| secondary_index_inverted(vector<uint16>::const_iterator loc, |
| vector<uint16>::const_iterator block_begin) : |
| vector<uint16>::const_iterator(loc), block_begin_(block_begin) {} |
| uint16 operator*() { |
| return ((1 + std::distance<vector<uint16>::const_iterator>( |
| block_begin_, *this)) << BitmapIndex::kStorageLogBitSize) - |
| vector<uint16>::const_iterator::operator*(); |
| } |
| private: |
| vector<uint16>::const_iterator block_begin_; |
| }; |
| |
| size_t BitmapIndex::Rank1(size_t end) const { |
| if (end == 0) return 0; |
| CHECK_LE(end, Bits()); |
| const uint32 end_word = (end - 1) >> BitmapIndex::kStorageLogBitSize; |
| const uint32 sum = get_index_ones_count(end_word); |
| const uint64 zero = 0; |
| const uint64 ones = ~zero; |
| return sum + __builtin_popcountll(bits_[end_word] & |
| (ones >> (kStorageBitSize - (end & kStorageBlockMask)))); |
| } |
| |
| size_t BitmapIndex::Select1(size_t bit_index) const { |
| if (bit_index >= GetOnesCount()) return Bits(); |
| // search primary index for the relevant block |
| uint32 rembits = bit_index + 1; |
| const uint32 block = find_primary_block(bit_index + 1); |
| uint32 offset = 0; |
| if (block > 0) { |
| rembits -= primary_index_[block - 1]; |
| offset += block * kSecondaryBlockSize; |
| } |
| // search the secondary index |
| uint32 word = find_secondary_block(offset, rembits); |
| if (word > 0) { |
| rembits -= secondary_index_[offset + word - 1]; |
| offset += word; |
| } |
| int nth = nth_bit(bits_[offset], rembits); |
| return (offset << BitmapIndex::kStorageLogBitSize) + nth; |
| } |
| |
| size_t BitmapIndex::Select0(size_t bit_index) const { |
| if (bit_index >= Bits() - GetOnesCount()) return Bits(); |
| // search inverted primary index for relevant block |
| uint32 remzeros = bit_index + 1; |
| uint32 offset = 0; |
| const uint32 block = find_inverted_primary_block(bit_index + 1); |
| if (block > 0) { |
| remzeros -= *primary_index_inverted(primary_index_.begin() + block - 1, |
| primary_index_.begin()); |
| offset += block * kSecondaryBlockSize; |
| } |
| // search the inverted secondary index |
| uint32 word = find_inverted_secondary_block(offset, remzeros); |
| if (word > 0) { |
| vector<uint16>::const_iterator block_begin = |
| secondary_index_.begin() + offset; |
| remzeros -= *secondary_index_inverted(block_begin + word - 1, block_begin); |
| offset += word; |
| } |
| int nth = nth_bit(~bits_[offset], remzeros); |
| return (offset << BitmapIndex::kStorageLogBitSize) + nth; |
| } |
| |
| size_t BitmapIndex::get_index_ones_count(size_t array_index) const { |
| uint32 sum = 0; |
| if (array_index > 0) { |
| sum += secondary_index_[array_index-1]; |
| uint32 end_block = (array_index - 1) / kSecondaryBlockSize; |
| if (end_block > 0) sum += primary_index_[end_block-1]; |
| } |
| return sum; |
| } |
| |
| void BitmapIndex::BuildIndex(const uint64 *bits, size_t size) { |
| bits_ = bits; |
| size_ = size; |
| secondary_index_.clear(); |
| secondary_index_.reserve(ArraySize()); |
| primary_index_.clear(); |
| primary_index_.reserve(primary_index_size()); |
| const uint64 zero = 0; |
| const uint64 ones = ~zero; |
| uint32 popcount = 0; |
| for (uint32 block_begin = 0; block_begin < ArraySize(); |
| block_begin += kSecondaryBlockSize) { |
| uint32 block_popcount = 0; |
| uint32 block_end = block_begin + kSecondaryBlockSize; |
| if (block_end > ArraySize()) block_end = ArraySize(); |
| for (uint32 j = block_begin; j < block_end; ++j) { |
| uint64 mask = ones; |
| if (j == ArraySize() - 1) { |
| mask = ones >> (-size_ & BitmapIndex::kStorageBlockMask); |
| } |
| block_popcount += __builtin_popcountll(bits_[j] & mask); |
| secondary_index_.push_back(block_popcount); |
| } |
| popcount += block_popcount; |
| primary_index_.push_back(popcount); |
| } |
| } |
| |
| size_t BitmapIndex::find_secondary_block( |
| size_t block_begin, size_t rem_bit_index) const { |
| size_t block_end = block_begin + kSecondaryBlockSize; |
| if (block_end > secondary_index_.size()) block_end = secondary_index_.size(); |
| return std::distance(secondary_index_.begin() + block_begin, |
| std::lower_bound(secondary_index_.begin() + block_begin, |
| secondary_index_.begin() + block_end, |
| rem_bit_index)); |
| } |
| |
| size_t BitmapIndex::find_inverted_secondary_block( |
| size_t block_begin, size_t rem_bit_index) const { |
| size_t block_end = block_begin + kSecondaryBlockSize; |
| if (block_end > secondary_index_.size()) block_end = secondary_index_.size(); |
| secondary_index_inverted start(secondary_index_.begin() + block_begin, |
| secondary_index_.begin() + block_begin); |
| secondary_index_inverted end(secondary_index_.begin() + block_end, |
| secondary_index_.begin() + block_begin); |
| return std::distance(start, |
| std::lower_bound(start, end, rem_bit_index)); |
| } |
| |
| inline size_t BitmapIndex::find_primary_block(size_t bit_index) const { |
| return std::distance(primary_index_.begin(), |
| std::lower_bound(primary_index_.begin(), |
| primary_index_.end(), bit_index)); |
| } |
| |
| size_t BitmapIndex::find_inverted_primary_block(size_t bit_index) const { |
| primary_index_inverted start(primary_index_.begin(), primary_index_.begin()); |
| primary_index_inverted end(primary_index_.end(), primary_index_.begin()); |
| return std::distance(start, std::lower_bound(start, end, bit_index)); |
| } |
| |
| } // end namespace fst |