| // Copyright (C) 2019 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "icing/index/index.h" |
| |
| #include <algorithm> |
| #include <cstddef> |
| #include <cstdint> |
| #include <memory> |
| #include <string> |
| #include <utility> |
| #include <vector> |
| |
| #include "icing/text_classifier/lib3/utils/base/status.h" |
| #include "icing/text_classifier/lib3/utils/base/statusor.h" |
| #include "icing/absl_ports/canonical_errors.h" |
| #include "icing/absl_ports/str_cat.h" |
| #include "icing/file/filesystem.h" |
| #include "icing/index/hit/hit.h" |
| #include "icing/index/iterator/doc-hit-info-iterator-or.h" |
| #include "icing/index/iterator/doc-hit-info-iterator.h" |
| #include "icing/index/lite/doc-hit-info-iterator-term-lite.h" |
| #include "icing/index/lite/lite-index.h" |
| #include "icing/index/main/doc-hit-info-iterator-term-main.h" |
| #include "icing/index/main/main-index.h" |
| #include "icing/index/term-id-codec.h" |
| #include "icing/index/term-metadata.h" |
| #include "icing/legacy/core/icing-string-util.h" |
| #include "icing/legacy/index/icing-dynamic-trie.h" |
| #include "icing/legacy/index/icing-filesystem.h" |
| #include "icing/proto/scoring.pb.h" |
| #include "icing/proto/storage.pb.h" |
| #include "icing/proto/term.pb.h" |
| #include "icing/schema/section.h" |
| #include "icing/scoring/ranker.h" |
| #include "icing/store/document-id.h" |
| #include "icing/store/suggestion-result-checker.h" |
| #include "icing/util/logging.h" |
| #include "icing/util/status-macros.h" |
| |
| namespace icing { |
| namespace lib { |
| |
| namespace { |
| |
| libtextclassifier3::StatusOr<LiteIndex::Options> CreateLiteIndexOptions( |
| const Index::Options& options) { |
| if (options.index_merge_size <= 0) { |
| return absl_ports::InvalidArgumentError( |
| "Requested hit buffer size must be greater than 0."); |
| } |
| if (options.index_merge_size > LiteIndex::max_hit_buffer_size()) { |
| return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( |
| "Requested hit buffer size %d is too large.", |
| options.index_merge_size)); |
| } |
| return LiteIndex::Options( |
| options.base_dir + "/idx/lite.", options.index_merge_size, |
| options.lite_index_sort_at_indexing, options.lite_index_sort_size); |
| } |
| |
// Returns the path of the main index directory under `base_dir`.
std::string MakeMainIndexFilepath(const std::string& base_dir) {
  std::string filepath = base_dir;
  filepath.append("/idx/main");
  return filepath;
}
| |
| IcingDynamicTrie::Options GetMainLexiconOptions() { |
| // The default values for IcingDynamicTrie::Options is fine for the main |
| // lexicon. |
| return IcingDynamicTrie::Options(); |
| } |
| |
| enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms }; |
| |
| // Merge the TermMetadata from lite index and main index. If the term exists in |
| // both index, sum up its hit count and push it to the term heap. |
| // The heap is a min-heap. So that we can avoid some push operation but the time |
| // complexity is O(NlgK) which N is total number of term and K is num_to_return. |
| std::vector<TermMetadata> MergeAndRankTermMetadatas( |
| std::vector<TermMetadata> lite_term_metadata_list, |
| std::vector<TermMetadata> main_term_metadata_list, int num_to_return) { |
| std::vector<TermMetadata> merged_term_metadata_heap; |
| merged_term_metadata_heap.reserve( |
| std::min(lite_term_metadata_list.size() + main_term_metadata_list.size(), |
| static_cast<size_t>(num_to_return))); |
| |
| auto lite_term_itr = lite_term_metadata_list.begin(); |
| auto main_term_itr = main_term_metadata_list.begin(); |
| MergeAction merge_action; |
| while (lite_term_itr != lite_term_metadata_list.end() || |
| main_term_itr != main_term_metadata_list.end()) { |
| // Get pointers to the next metadatas in each group, if available |
| // Determine how to merge. |
| if (main_term_itr == main_term_metadata_list.end()) { |
| merge_action = MergeAction::kTakeLiteTerm; |
| } else if (lite_term_itr == lite_term_metadata_list.end()) { |
| merge_action = MergeAction::kTakeMainTerm; |
| } else if (lite_term_itr->content < main_term_itr->content) { |
| merge_action = MergeAction::kTakeLiteTerm; |
| } else if (main_term_itr->content < lite_term_itr->content) { |
| merge_action = MergeAction::kTakeMainTerm; |
| } else { |
| // The next metadatas refer to the same term. Combine them. |
| merge_action = MergeAction::kMergeTerms; |
| } |
| switch (merge_action) { |
| case MergeAction::kTakeLiteTerm: |
| PushToTermHeap(std::move(*lite_term_itr), num_to_return, |
| merged_term_metadata_heap); |
| ++lite_term_itr; |
| break; |
| case MergeAction::kTakeMainTerm: |
| PushToTermHeap(std::move(*main_term_itr), num_to_return, |
| merged_term_metadata_heap); |
| ++main_term_itr; |
| break; |
| case MergeAction::kMergeTerms: |
| int total_est_hit_count = lite_term_itr->score + main_term_itr->score; |
| PushToTermHeap(TermMetadata(std::move(lite_term_itr->content), |
| total_est_hit_count), |
| num_to_return, merged_term_metadata_heap); |
| ++lite_term_itr; |
| ++main_term_itr; |
| break; |
| } |
| } |
| // Reverse the list since we pop them from a min heap and we need to return in |
| // decreasing order. |
| std::vector<TermMetadata> merged_term_metadata_list = |
| PopAllTermsFromHeap(merged_term_metadata_heap); |
| std::reverse(merged_term_metadata_list.begin(), |
| merged_term_metadata_list.end()); |
| return merged_term_metadata_list; |
| } |
| |
| } // namespace |
| |
libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create(
    const Options& options, const Filesystem* filesystem,
    const IcingFilesystem* icing_filesystem) {
  ICING_RETURN_ERROR_IF_NULL(filesystem);
  ICING_RETURN_ERROR_IF_NULL(icing_filesystem);

  // Validate the requested merge size and derive the lite index's options,
  // including its file path under options.base_dir.
  ICING_ASSIGN_OR_RETURN(LiteIndex::Options lite_index_options,
                         CreateLiteIndexOptions(options));
  // The codec is sized from the maximum value index of each lexicon's trie
  // options so it can map between lite and main term value indices.
  ICING_ASSIGN_OR_RETURN(
      std::unique_ptr<TermIdCodec> term_id_codec,
      TermIdCodec::Create(
          IcingDynamicTrie::max_value_index(GetMainLexiconOptions()),
          IcingDynamicTrie::max_value_index(
              lite_index_options.lexicon_options)));

  ICING_ASSIGN_OR_RETURN(
      std::unique_ptr<LiteIndex> lite_index,
      LiteIndex::Create(lite_index_options, icing_filesystem));
  // Sort the lite index if we've enabled sorting the HitBuffer at indexing
  // time, and there's an unsorted tail exceeding the threshold.
  if (options.lite_index_sort_at_indexing &&
      lite_index->HasUnsortedHitsExceedingSortThreshold()) {
    lite_index->SortHits();
  }

  ICING_ASSIGN_OR_RETURN(
      std::unique_ptr<MainIndex> main_index,
      MainIndex::Create(MakeMainIndexFilepath(options.base_dir), filesystem,
                        icing_filesystem));
  // Wrap with `new` instead of std::make_unique — presumably because the
  // Index constructor is not public; TODO confirm against index.h.
  return std::unique_ptr<Index>(new Index(options, std::move(term_id_codec),
                                          std::move(lite_index),
                                          std::move(main_index), filesystem));
}
| |
| /* static */ libtextclassifier3::StatusOr<int> Index::ReadFlashIndexMagic( |
| const Filesystem* filesystem, const std::string& base_dir) { |
| return MainIndex::ReadFlashIndexMagic(filesystem, |
| MakeMainIndexFilepath(base_dir)); |
| } |
| |
| libtextclassifier3::Status Index::TruncateTo(DocumentId document_id) { |
| if (lite_index_->last_added_document_id() != kInvalidDocumentId && |
| lite_index_->last_added_document_id() > document_id) { |
| ICING_VLOG(1) << "Clipping to " << document_id |
| << ". Throwing out lite index which is at " |
| << lite_index_->last_added_document_id(); |
| ICING_RETURN_IF_ERROR(lite_index_->Reset()); |
| } |
| if (main_index_->last_added_document_id() != kInvalidDocumentId && |
| main_index_->last_added_document_id() > document_id) { |
| ICING_VLOG(1) << "Clipping to " << document_id |
| << ". Throwing out lite index which is at " |
| << main_index_->last_added_document_id(); |
| ICING_RETURN_IF_ERROR(main_index_->Reset()); |
| } |
| return libtextclassifier3::Status::OK; |
| } |
| |
| libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> |
| Index::GetIterator(const std::string& term, int term_start_index, |
| int unnormalized_term_length, SectionIdMask section_id_mask, |
| TermMatchType::Code term_match_type, |
| bool need_hit_term_frequency) { |
| std::unique_ptr<DocHitInfoIterator> lite_itr; |
| std::unique_ptr<DocHitInfoIterator> main_itr; |
| switch (term_match_type) { |
| case TermMatchType::EXACT_ONLY: |
| lite_itr = std::make_unique<DocHitInfoIteratorTermLiteExact>( |
| term_id_codec_.get(), lite_index_.get(), term, term_start_index, |
| unnormalized_term_length, section_id_mask, need_hit_term_frequency); |
| main_itr = std::make_unique<DocHitInfoIteratorTermMainExact>( |
| main_index_.get(), term, term_start_index, unnormalized_term_length, |
| section_id_mask, need_hit_term_frequency); |
| break; |
| case TermMatchType::PREFIX: |
| lite_itr = std::make_unique<DocHitInfoIteratorTermLitePrefix>( |
| term_id_codec_.get(), lite_index_.get(), term, term_start_index, |
| unnormalized_term_length, section_id_mask, need_hit_term_frequency); |
| main_itr = std::make_unique<DocHitInfoIteratorTermMainPrefix>( |
| main_index_.get(), term, term_start_index, unnormalized_term_length, |
| section_id_mask, need_hit_term_frequency); |
| break; |
| default: |
| return absl_ports::InvalidArgumentError( |
| absl_ports::StrCat("Invalid TermMatchType: ", |
| TermMatchType::Code_Name(term_match_type))); |
| } |
| return std::make_unique<DocHitInfoIteratorOr>(std::move(lite_itr), |
| std::move(main_itr)); |
| } |
| |
// Returns metadata (term content + score) for every term in the lite index's
// lexicon that starts with `prefix` and scores positively under `score_by`,
// restricted by `suggestion_result_checker`.
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
Index::FindLiteTermsByPrefix(
    const std::string& prefix,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
    const SuggestionResultChecker* suggestion_result_checker) {
  // Finds all the terms that start with the given prefix in the lexicon.
  IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(),
                                           prefix.c_str());

  std::vector<TermMetadata> term_metadata_list;
  while (term_iterator.IsValid()) {
    uint32_t term_value_index = term_iterator.GetValueIndex();

    // Map the trie's value index to a lite term id; an encoding failure is
    // surfaced as an InternalError instead of the codec's own status.
    ICING_ASSIGN_OR_RETURN(
        uint32_t term_id,
        term_id_codec_->EncodeTvi(term_value_index, TviType::LITE),
        absl_ports::InternalError("Failed to access terms in lexicon."));
    ICING_ASSIGN_OR_RETURN(
        int hit_score,
        lite_index_->ScoreHits(term_id, score_by, suggestion_result_checker));
    if (hit_score > 0) {
      // A positive score means at least one document accepted by the result
      // checker contains this term.
      term_metadata_list.push_back(
          TermMetadata(term_iterator.GetKey(), hit_score));
    }

    term_iterator.Advance();
  }
  return term_metadata_list;
}
| |
// Returns up to `num_to_return` terms starting with `prefix`, gathered from
// both the lite and main indices, merged and ranked by score (descending).
// Returns an empty list when num_to_return <= 0.
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
Index::FindTermsByPrefix(
    const std::string& prefix, int num_to_return,
    TermMatchType::Code scoring_match_type,
    SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
    const SuggestionResultChecker* suggestion_result_checker) {
  std::vector<TermMetadata> term_metadata_list;
  if (num_to_return <= 0) {
    return term_metadata_list;
  }
  // Get results from the LiteIndex.
  // TODO(b/250648165) support score term by prefix_hit in lite_index.
  ICING_ASSIGN_OR_RETURN(
      std::vector<TermMetadata> lite_term_metadata_list,
      FindLiteTermsByPrefix(prefix, rank_by, suggestion_result_checker));
  // Append results from the MainIndex.
  ICING_ASSIGN_OR_RETURN(
      std::vector<TermMetadata> main_term_metadata_list,
      main_index_->FindTermsByPrefix(prefix, scoring_match_type, rank_by,
                                     suggestion_result_checker));
  // Merge the two (term-ordered) lists, summing scores for shared terms, and
  // keep only the top num_to_return entries.
  return MergeAndRankTermMetadatas(std::move(lite_term_metadata_list),
                                   std::move(main_term_metadata_list),
                                   num_to_return);
}
| |
| IndexStorageInfoProto Index::GetStorageInfo() const { |
| IndexStorageInfoProto storage_info; |
| int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str()); |
| storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size)); |
| storage_info = lite_index_->GetStorageInfo(std::move(storage_info)); |
| return main_index_->GetStorageInfo(std::move(storage_info)); |
| } |
| |
| libtextclassifier3::Status Index::Optimize( |
| const std::vector<DocumentId>& document_id_old_to_new, |
| DocumentId new_last_added_document_id) { |
| if (main_index_->last_added_document_id() != kInvalidDocumentId) { |
| ICING_RETURN_IF_ERROR(main_index_->Optimize(document_id_old_to_new)); |
| } |
| return lite_index_->Optimize(document_id_old_to_new, term_id_codec_.get(), |
| new_last_added_document_id); |
| } |
| |
| libtextclassifier3::Status Index::Editor::BufferTerm(const char* term) { |
| // Step 1: See if this term is already in the lexicon |
| uint32_t tvi; |
| auto tvi_or = lite_index_->GetTermId(term); |
| |
| // Step 2: Update the lexicon, either add the term or update its properties |
| if (tvi_or.ok()) { |
| tvi = tvi_or.ValueOrDie(); |
| if (seen_tokens_.find(tvi) != seen_tokens_.end()) { |
| ICING_VLOG(1) << "Updating term frequency for term " << term; |
| if (seen_tokens_[tvi] != Hit::kMaxTermFrequency) { |
| ++seen_tokens_[tvi]; |
| } |
| return libtextclassifier3::Status::OK; |
| } |
| ICING_VLOG(1) << "Term " << term |
| << " is already present in lexicon. Updating."; |
| // Already in the lexicon. Just update the properties. |
| ICING_RETURN_IF_ERROR(lite_index_->UpdateTermProperties( |
| tvi, term_match_type_ == TermMatchType::PREFIX, namespace_id_)); |
| } else { |
| ICING_VLOG(1) << "Term " << term << " is not in lexicon. Inserting."; |
| // Haven't seen this term before. Add it to the lexicon. |
| ICING_ASSIGN_OR_RETURN( |
| tvi, lite_index_->InsertTerm(term, term_match_type_, namespace_id_)); |
| } |
| // Token seen for the first time in the current document. |
| seen_tokens_[tvi] = 1; |
| return libtextclassifier3::Status::OK; |
| } |
| |
| libtextclassifier3::Status Index::Editor::IndexAllBufferedTerms() { |
| for (auto itr = seen_tokens_.begin(); itr != seen_tokens_.end(); itr++) { |
| Hit hit(section_id_, document_id_, /*term_frequency=*/itr->second, |
| /*is_in_prefix_section=*/term_match_type_ == TermMatchType::PREFIX, |
| /*is_prefix_hit=*/false); |
| ICING_ASSIGN_OR_RETURN( |
| uint32_t term_id, term_id_codec_->EncodeTvi(itr->first, TviType::LITE)); |
| ICING_RETURN_IF_ERROR(lite_index_->AddHit(term_id, hit)); |
| } |
| return libtextclassifier3::Status::OK; |
| } |
| |
| } // namespace lib |
| } // namespace icing |