| // Copyright (C) 2019 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "icing/index/lite-index.h" |
| |
| #include <inttypes.h> |
| #include <stddef.h> |
| #include <stdint.h> |
| #include <sys/mman.h> |
| |
| #include <algorithm> |
| #include <cstdint> |
| #include <memory> |
| #include <string> |
| #include <string_view> |
| #include <utility> |
| #include <vector> |
| |
| #include "icing/text_classifier/lib3/utils/base/status.h" |
| #include "icing/text_classifier/lib3/utils/base/statusor.h" |
| #include "icing/absl_ports/canonical_errors.h" |
| #include "icing/absl_ports/str_cat.h" |
| #include "icing/file/filesystem.h" |
| #include "icing/index/hit/doc-hit-info.h" |
| #include "icing/index/hit/hit.h" |
| #include "icing/legacy/core/icing-string-util.h" |
| #include "icing/legacy/core/icing-timer.h" |
| #include "icing/legacy/index/icing-array-storage.h" |
| #include "icing/legacy/index/icing-dynamic-trie.h" |
| #include "icing/legacy/index/icing-filesystem.h" |
| #include "icing/legacy/index/icing-lite-index-header.h" |
| #include "icing/legacy/index/icing-mmapper.h" |
| #include "icing/proto/term.pb.h" |
| #include "icing/schema/section.h" |
| #include "icing/store/document-id.h" |
| #include "icing/util/crc32.h" |
| #include "icing/util/logging.h" |
| #include "icing/util/status-macros.h" |
| |
| namespace icing { |
| namespace lib { |
| |
| namespace { |
| |
| // Point at which we declare the trie full. |
| constexpr double kTrieFullFraction = 0.95; |
| |
// Derives the hit buffer's file path: the index's base filename with an
// "hb" suffix appended.
std::string MakeHitBufferFilename(const std::string& filename_base) {
  std::string filename = filename_base;
  filename.append("hb");
  return filename;
}
| |
// Size in bytes of the hit buffer's on-disk header struct.
size_t header_size() { return sizeof(IcingLiteIndex_HeaderImpl::HeaderData); }
| |
| } // namespace |
| |
// Sentinel element value: an Element packed from term_id 0 and a
// default-constructed (invalid) Hit.
const LiteIndex::Element::Value LiteIndex::Element::kInvalidValue =
    LiteIndex::Element(0, Hit()).value();
| |
| libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create( |
| const LiteIndex::Options& options, const IcingFilesystem* filesystem) { |
| ICING_RETURN_ERROR_IF_NULL(filesystem); |
| |
| std::unique_ptr<LiteIndex> lite_index = |
| std::unique_ptr<LiteIndex>(new LiteIndex(options, filesystem)); |
| ICING_RETURN_IF_ERROR(lite_index->Initialize()); |
| return std::move(lite_index); |
| } |
| |
// size is max size in elements. An appropriate lexicon and display
// mapping size will be chosen based on hit buffer size.
//
// Note: the instance is not usable until Initialize() succeeds (see
// Create()). The header mmap is created unmapped here and configured with
// MAP_SHARED so that, once mapped, header writes go to the backing file.
LiteIndex::LiteIndex(const LiteIndex::Options& options,
                     const IcingFilesystem* filesystem)
    : hit_buffer_(*filesystem),
      hit_buffer_crc_(0),
      lexicon_(options.filename_base + "lexicon", MakeTrieRuntimeOptions(),
               filesystem),
      header_mmap_(false, MAP_SHARED),
      options_(options),
      filesystem_(filesystem) {}
| |
| LiteIndex::~LiteIndex() { |
| if (initialized()) { |
| libtextclassifier3::Status unused = PersistToDisk(); |
| } |
| } |
| |
| IcingDynamicTrie::RuntimeOptions LiteIndex::MakeTrieRuntimeOptions() { |
| return IcingDynamicTrie::RuntimeOptions().set_storage_policy( |
| IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc); |
| } |
| |
// Opens (or freshly creates) the lexicon trie and the mmapped hit buffer,
// then validates their on-disk state. On any failure every
// partially-initialized member is torn down before returning, leaving the
// instance closed. Returns:
//  - OK on success,
//  - INTERNAL on file/mmap/init errors or a header magic mismatch,
//  - DATA_LOSS when the stored crc does not match the recomputed one.
libtextclassifier3::Status LiteIndex::Initialize() {
  // Size of hit buffer's header struct, rounded up to the nearest number of
  // system memory pages.
  const size_t header_padded_size =
      IcingMMapper::page_aligned_size(header_size());

  // Variable declarations cannot cross goto jumps, so declare these up top.
  libtextclassifier3::Status status;
  uint64_t file_size;
  IcingTimer timer;

  if (!lexicon_.CreateIfNotExist(options_.lexicon_options) ||
      !lexicon_.Init()) {
    return absl_ports::InternalError("Failed to initialize lexicon trie");
  }

  hit_buffer_fd_.reset(filesystem_->OpenForWrite(
      MakeHitBufferFilename(options_.filename_base).c_str()));
  if (!hit_buffer_fd_.is_valid()) {
    status = absl_ports::InternalError("Failed to open hit buffer file");
    goto error;
  }

  file_size = filesystem_->GetFileSize(hit_buffer_fd_.get());
  if (file_size == IcingFilesystem::kBadFileSize) {
    status = absl_ports::InternalError("Failed to query hit buffer file size");
    goto error;
  }

  if (file_size < header_padded_size) {
    // Anything between zero and a full header is a truncated/corrupt file.
    if (file_size != 0) {
      status = absl_ports::InternalError(IcingStringUtil::StringPrintf(
          "Hit buffer had unexpected size %" PRIu64, file_size));
      goto error;
    }

    ICING_VLOG(2) << "Creating new hit buffer";
    // Make sure files are fresh.
    if (!lexicon_.Remove() ||
        !lexicon_.CreateIfNotExist(options_.lexicon_options) ||
        !lexicon_.Init()) {
      status =
          absl_ports::InternalError("Failed to refresh lexicon during clear");
      goto error;
    }

    // Create fresh hit buffer by first emptying the hit buffer file and then
    // allocating header_padded_size of the cleared space.
    if (!filesystem_->Truncate(hit_buffer_fd_.get(), 0) ||
        !filesystem_->Truncate(hit_buffer_fd_.get(), header_padded_size)) {
      status = absl_ports::InternalError("Failed to truncate hit buffer file");
      goto error;
    }

    // Set up header.
    header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
    header_ = std::make_unique<IcingLiteIndex_HeaderImpl>(
        reinterpret_cast<IcingLiteIndex_HeaderImpl::HeaderData*>(
            header_mmap_.address()));
    header_->Reset();

    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
                          sizeof(Element::Value), header_->cur_size(),
                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
      status = absl_ports::InternalError("Failed to initialize new hit buffer");
      goto error;
    }

    // Persist the initial checksum so a clean reopen passes the crc check.
    UpdateChecksum();
  } else {
    // Existing index: map the header and re-attach the hit buffer.
    header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
    header_ = std::make_unique<IcingLiteIndex_HeaderImpl>(
        reinterpret_cast<IcingLiteIndex_HeaderImpl::HeaderData*>(
            header_mmap_.address()));

    if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
                          sizeof(Element::Value), header_->cur_size(),
                          options_.hit_buffer_size, &hit_buffer_crc_, true)) {
      status = absl_ports::InternalError(
          "Failed to re-initialize existing hit buffer");
      goto error;
    }

    // Check integrity.
    if (!header_->check_magic()) {
      status = absl_ports::InternalError("Lite index header magic mismatch");
      goto error;
    }
    Crc32 crc = ComputeChecksum();
    if (crc.Get() != header_->lite_index_crc()) {
      status = absl_ports::DataLossError(
          IcingStringUtil::StringPrintf("Lite index crc check failed: %u vs %u",
                                        crc.Get(), header_->lite_index_crc()));
      goto error;
    }
  }

  ICING_VLOG(2) << IcingStringUtil::StringPrintf("Lite index init ok in %.3fms",
                                                 timer.Elapsed() * 1000);
  return status;

error:
  // Undo any partial initialization so a later Create() starts clean.
  header_ = nullptr;
  header_mmap_.Unmap();
  lexicon_.Close();
  hit_buffer_crc_ = 0;
  hit_buffer_.Reset();
  hit_buffer_fd_.reset();
  if (status.ok()) {
    return absl_ports::InternalError(
        "Error handling code ran but status was ok");
  }
  return status;
}
| |
| Crc32 LiteIndex::ComputeChecksum() { |
| IcingTimer timer; |
| |
| // Update crcs. |
| uint32_t dependent_crcs[2]; |
| hit_buffer_.UpdateCrc(); |
| dependent_crcs[0] = hit_buffer_crc_; |
| dependent_crcs[1] = lexicon_.UpdateCrc(); |
| |
| // Compute the master crc. |
| |
| // Header crc, excluding the actual crc field. |
| Crc32 all_crc(header_->CalculateHeaderCrc()); |
| all_crc.Append(std::string_view(reinterpret_cast<const char*>(dependent_crcs), |
| sizeof(dependent_crcs))); |
| ICING_VLOG(2) << IcingStringUtil::StringPrintf( |
| "Lite index crc computed in %.3fms", timer.Elapsed() * 1000); |
| |
| return all_crc; |
| } |
| |
// Clears all indexed content: empties the lexicon and the hit buffer, resets
// the header, and persists a checksum for the now-empty index. Currently
// always returns OK (see TODO below).
libtextclassifier3::Status LiteIndex::Reset() {
  IcingTimer timer;

  // TODO(b/140436942): When these components have been changed to return errors
  // they should be propagated from here.
  lexicon_.Clear();
  hit_buffer_.Clear();
  header_->Reset();
  UpdateChecksum();

  ICING_VLOG(2) << IcingStringUtil::StringPrintf("Lite index clear in %.3fms",
                                                 timer.Elapsed() * 1000);
  return libtextclassifier3::Status::OK;
}
| |
// Forwards to the hit buffer's and lexicon's own Warm() routines (presumably
// pre-faulting their backing pages ahead of queries — see those classes for
// the exact behavior).
void LiteIndex::Warm() {
  hit_buffer_.Warm();
  lexicon_.Warm();
}
| |
// Flushes in-memory state to durable storage: lexicon, hit buffer, then the
// header (with a freshly computed crc). Returns INTERNAL if the lexicon sync
// fails.
libtextclassifier3::Status LiteIndex::PersistToDisk() {
  bool success = true;
  if (!lexicon_.Sync()) {
    ICING_VLOG(1) << "Failed to sync the lexicon.";
    success = false;
  }
  // NOTE(review): unlike lexicon_.Sync(), the results of hit_buffer_.Sync()
  // and header_mmap_.Sync() are not checked here — confirm whether that is
  // deliberate (e.g. void returns) or a missed error path.
  hit_buffer_.Sync();
  UpdateChecksum();
  header_mmap_.Sync();

  return (success) ? libtextclassifier3::Status::OK
                   : absl_ports::InternalError(
                         "Unable to sync lite index components.");
}
| |
| void LiteIndex::UpdateChecksum() { |
| header_->set_lite_index_crc(ComputeChecksum().Get()); |
| } |
| |
| libtextclassifier3::StatusOr<uint32_t> LiteIndex::InsertTerm( |
| const std::string& term, TermMatchType::Code term_match_type) { |
| uint32_t tvi; |
| if (!lexicon_.Insert(term.c_str(), "", &tvi, false)) { |
| return absl_ports::ResourceExhaustedError( |
| absl_ports::StrCat("Unable to add term ", term, " to lexicon!")); |
| } |
| ICING_RETURN_IF_ERROR(UpdateTerm(tvi, term_match_type)); |
| return tvi; |
| } |
| |
| libtextclassifier3::Status LiteIndex::UpdateTerm( |
| uint32_t tvi, TermMatchType::Code term_match_type) { |
| if (term_match_type != TermMatchType::PREFIX) { |
| return libtextclassifier3::Status::OK; |
| } |
| |
| if (!lexicon_.SetProperty(tvi, kHasHitsInPrefixSection)) { |
| return absl_ports::ResourceExhaustedError( |
| "Insufficient disk space to create property!"); |
| } |
| return libtextclassifier3::Status::OK; |
| } |
| |
| libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) { |
| if (is_full()) { |
| return absl_ports::ResourceExhaustedError("Hit buffer is full!"); |
| } |
| |
| header_->set_last_added_docid(hit.document_id()); |
| |
| Element elt(term_id, hit); |
| uint32_t cur_size = header_->cur_size(); |
| Element::Value* valp = hit_buffer_.GetMutableMem<Element::Value>(cur_size, 1); |
| if (valp == nullptr) { |
| return absl_ports::ResourceExhaustedError( |
| "Allocating more space in hit buffer failed!"); |
| } |
| *valp = elt.value(); |
| header_->set_cur_size(cur_size + 1); |
| |
| return libtextclassifier3::Status::OK; |
| } |
| |
| libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm( |
| const std::string& term) const { |
| char dummy; |
| uint32_t tvi; |
| if (!lexicon_.Find(term.c_str(), &dummy, &tvi)) { |
| return absl_ports::NotFoundError( |
| absl_ports::StrCat("Could not find ", term, " in the lexicon.")); |
| } |
| return tvi; |
| } |
| |
// Appends DocHitInfos for every hit of `term_id`, scanning forward from the
// first sorted occurrence located by Seek(). Hits are filtered by
// `section_id_mask` and, when `only_from_prefix_sections` is set, restricted
// to hits flagged as in a prefix section. Consecutive hits for the same
// document are coalesced into a single DocHitInfo. Returns the number of
// distinct documents matched; `hits_out` may be nullptr to only count.
uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
                               bool only_from_prefix_sections,
                               std::vector<DocHitInfo>* hits_out) {
  uint32_t count = 0;
  DocumentId last_document_id = kInvalidDocumentId;
  for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) {
    Element elt(hit_buffer_.array_cast<Element>()[idx]);
    // Seek() sorted the buffer, so the first different term_id ends the run.
    if (elt.term_id() != term_id) break;

    const Hit& hit = elt.hit();
    // Check sections.
    if (((1u << hit.section_id()) & section_id_mask) == 0) {
      continue;
    }
    // Check prefix section only.
    if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
      continue;
    }
    DocumentId document_id = hit.document_id();
    if (document_id != last_document_id) {
      count++;
      if (hits_out != nullptr) {
        hits_out->push_back(DocHitInfo(document_id));
      }
      last_document_id = document_id;
    }
    if (hits_out != nullptr) {
      // back() is the entry for the current document (pushed above on the
      // first surviving hit for this docid).
      hits_out->back().UpdateSection(hit.section_id(), hit.score());
    }
  }
  return count;
}
| |
| bool LiteIndex::is_full() const { |
| return (header_->cur_size() == options_.hit_buffer_size || |
| lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction)); |
| } |
| |
| void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const { |
| absl_ports::StrAppend( |
| out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n", |
| header_->cur_size(), |
| options_.hit_buffer_size)); |
| |
| // Lexicon. |
| out->append("Lexicon stats:\n"); |
| lexicon_.GetDebugInfo(verbosity, out); |
| } |
| |
// Returns the index of the first hit buffer element whose term_id equals
// `term_id` (or cur_size() if there is none). As a side effect, sorts the
// unsorted tail of the hit buffer and merges it into the already-searchable
// region so the binary search below is valid, updating the header's
// searchable_end and the index checksum.
uint32_t LiteIndex::Seek(uint32_t term_id) {
  // Make searchable by sorting by hit buffer.
  uint32_t sort_len = header_->cur_size() - header_->searchable_end();
  if (sort_len > 0) {
    IcingTimer timer;

    auto* array_start =
        hit_buffer_.GetMutableMem<Element::Value>(0, header_->cur_size());
    Element::Value* sort_start = array_start + header_->searchable_end();
    std::sort(sort_start, array_start + header_->cur_size());

    // Now merge with previous region. Since the previous region is already
    // sorted and deduplicated, optimize the merge by skipping everything less
    // than the new region's smallest value.
    if (header_->searchable_end() > 0) {
      std::inplace_merge(array_start, array_start + header_->searchable_end(),
                         array_start + header_->cur_size());
    }
    ICING_VLOG(2) << IcingStringUtil::StringPrintf(
        "Lite index sort and merge %u into %u in %.3fms", sort_len,
        header_->searchable_end(), timer.Elapsed() * 1000);

    // Now the entire array is sorted.
    header_->set_searchable_end(header_->cur_size());

    // Update crc in-line.
    UpdateChecksum();
  }

  // Binary search for our term_id. Make sure we get the first element:
  // probing with kMaxDocumentIdSortValue gives the hit value that sorts
  // first for this term, so lower_bound lands on the term's first element.
  Element elt(term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore));

  const Element::Value* array = hit_buffer_.array_cast<Element::Value>();
  const Element::Value* ptr =
      std::lower_bound(array, array + header_->cur_size(), elt.value());
  return ptr - array;
}
| |
| } // namespace lib |
| } // namespace icing |