| // Copyright (C) 2019 Google LLC |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| #include "icing/index/main/main-index.h" |
| |
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iterator>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
| |
| #include "icing/absl_ports/canonical_errors.h" |
| #include "icing/absl_ports/str_cat.h" |
| #include "icing/file/destructible-directory.h" |
| #include "icing/file/posting_list/flash-index-storage.h" |
| #include "icing/file/posting_list/posting-list-common.h" |
| #include "icing/index/main/posting-list-hit-serializer.h" |
| #include "icing/index/term-id-codec.h" |
| #include "icing/index/term-property-id.h" |
| #include "icing/legacy/core/icing-string-util.h" |
| #include "icing/legacy/index/icing-dynamic-trie.h" |
| #include "icing/proto/debug.pb.h" |
| #include "icing/proto/storage.pb.h" |
| #include "icing/proto/term.pb.h" |
| #include "icing/util/logging.h" |
| #include "icing/util/status-macros.h" |
| |
| namespace icing { |
| namespace lib { |
| |
| namespace { |
| |
// Result of FindShortestValidTermWithPrefixHits below.
struct FindTermResult {
  // TVI of the term that was found. Undefined if found=false.
  uint32_t tvi;
  // Whether or not a valid term with prefix hits was found.
  bool found;
  // Whether or not that term is equal to 'prefix'.
  bool exact;
};

// Finds the shortest, valid prefix term with prefix hits in the lexicon for
// which "prefix" is a prefix.
// Returns a valid FindTermResult with found=true if either:
//   1. prefix exists as a term in the lexicon.
//   2. the shortest, valid prefix in the lexicon exists and contains prefix
//      hits.
// Returns a FindTermResult with found=false and undefined values of tvi and
// exact if no term was found.
| FindTermResult FindShortestValidTermWithPrefixHits( |
| const IcingDynamicTrie* lexicon, const std::string& prefix) { |
| // For prefix indexing: when we are doing a prefix match for "prefix", find |
| // the tvi to the equivalent posting list. prefix's own posting list might not |
| // exist but one of its children acts as a proxy. |
| IcingDynamicTrie::PropertyReader hits_in_prefix_section( |
| *lexicon, GetHasHitsInPrefixSectionPropertyId()); |
| uint32_t tvi = 0; |
| bool found = false; |
| bool exact = false; |
| for (IcingDynamicTrie::Iterator it(*lexicon, prefix.c_str()); it.IsValid(); |
| it.Advance()) { |
| PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; |
| memcpy(&posting_list_id, it.GetValue(), sizeof(posting_list_id)); |
| |
| // Posting list id might be invalid if this is also a backfill term. |
| // Suppose that the main index has two pre-existing prefix hits "foot" and |
| // "fool" - it will have a branch point posting list for "foo". Then, let's |
| // suppose that the other index adds hits for "foul", "four" and "far". This |
| // will result in branch points for "fo" and "f". |
| // If "fo" was added before "f", then the iterator would first give us "fo". |
| // "fo" will have an invalid posting_list_id because it hasn't been |
| // backfilled yet, so we need to continue iterating to "foo". |
| if (posting_list_id.is_valid()) { |
| exact = (prefix.size() == strlen(it.GetKey())); |
| tvi = it.GetValueIndex(); |
| // Found it. Does it have prefix hits? |
| found = exact || hits_in_prefix_section.HasProperty(tvi); |
| break; |
| } |
| } |
| FindTermResult result = {tvi, found, exact}; |
| return result; |
| } |
| |
| std::string MakeFlashIndexFilename(const std::string& base_dir) { |
| return base_dir + "/main_index"; |
| } |
| |
| } // namespace |
| |
| MainIndex::MainIndex(const std::string& index_directory, |
| const Filesystem* filesystem, |
| const IcingFilesystem* icing_filesystem) |
| : base_dir_(index_directory), |
| filesystem_(filesystem), |
| icing_filesystem_(icing_filesystem), |
| posting_list_hit_serializer_( |
| std::make_unique<PostingListHitSerializer>()) {} |
| |
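// Usage sketch (illustrative only; the path and the surrounding error
// handling are assumptions, not requirements of this API):
//   ICING_ASSIGN_OR_RETURN(
//       std::unique_ptr<MainIndex> main_index,
//       MainIndex::Create("/path/to/index/main", &filesystem,
//                         &icing_filesystem));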
| libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> MainIndex::Create( |
| const std::string& index_directory, const Filesystem* filesystem, |
| const IcingFilesystem* icing_filesystem) { |
| ICING_RETURN_ERROR_IF_NULL(filesystem); |
| ICING_RETURN_ERROR_IF_NULL(icing_filesystem); |
| std::unique_ptr<MainIndex> main_index( |
| new MainIndex(index_directory, filesystem, icing_filesystem)); |
| ICING_RETURN_IF_ERROR(main_index->Init()); |
| return main_index; |
| } |
| |
| /* static */ libtextclassifier3::StatusOr<int> MainIndex::ReadFlashIndexMagic( |
| const Filesystem* filesystem, const std::string& index_directory) { |
| return FlashIndexStorage::ReadHeaderMagic( |
| filesystem, MakeFlashIndexFilename(index_directory)); |
| } |
| |
| // TODO(b/139087650) : Migrate off of IcingFilesystem. |
| libtextclassifier3::Status MainIndex::Init() { |
| if (!filesystem_->CreateDirectoryRecursively(base_dir_.c_str())) { |
| return absl_ports::InternalError("Unable to create main index directory."); |
| } |
| std::string flash_index_file = MakeFlashIndexFilename(base_dir_); |
| ICING_ASSIGN_OR_RETURN( |
| FlashIndexStorage flash_index, |
| FlashIndexStorage::Create(flash_index_file, filesystem_, |
| posting_list_hit_serializer_.get())); |
| flash_index_storage_ = |
| std::make_unique<FlashIndexStorage>(std::move(flash_index)); |
| |
| std::string lexicon_file = base_dir_ + "/main-lexicon"; |
| IcingDynamicTrie::RuntimeOptions runtime_options; |
| main_lexicon_ = std::make_unique<IcingDynamicTrie>( |
| lexicon_file, runtime_options, icing_filesystem_); |
| IcingDynamicTrie::Options lexicon_options; |
| if (!main_lexicon_->CreateIfNotExist(lexicon_options) || |
| !main_lexicon_->Init()) { |
| return absl_ports::InternalError("Failed to initialize lexicon trie"); |
| } |
| return libtextclassifier3::Status::OK; |
| } |
| |
| libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const { |
| IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto()); |
| if (storage_info.main_index_storage_size() == -1 || |
| storage_info.main_index_lexicon_size() == -1) { |
| return absl_ports::AbortedError( |
| "Failed to get size of MainIndex's members."); |
| } |
| return storage_info.main_index_storage_size() + |
| storage_info.main_index_lexicon_size(); |
| } |
| |
| IndexStorageInfoProto MainIndex::GetStorageInfo( |
| IndexStorageInfoProto storage_info) const { |
| storage_info.set_main_index_lexicon_size( |
| IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize())); |
| storage_info.set_main_index_storage_size( |
| Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize())); |
| storage_info.set_main_index_block_size(flash_index_storage_->block_size()); |
| storage_info.set_num_blocks(flash_index_storage_->num_blocks()); |
| storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction()); |
| return storage_info; |
| } |
| |
| libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>> |
| MainIndex::GetAccessorForExactTerm(const std::string& term) { |
| PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; |
| if (!main_lexicon_->Find(term.c_str(), &posting_list_id)) { |
    return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
        "Term %s is not present in the main lexicon.", term.c_str()));
| } |
| return PostingListHitAccessor::CreateFromExisting( |
| flash_index_storage_.get(), posting_list_hit_serializer_.get(), |
| posting_list_id); |
| } |
| |
| libtextclassifier3::StatusOr<MainIndex::GetPrefixAccessorResult> |
| MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) { |
| bool exact = false; |
| // For prefix indexing: when we are doing a prefix match for |
| // "prefix", find the tvi to the equivalent posting list. prefix's |
| // own posting list might not exist but its shortest child acts as a proxy. |
| // |
  // For example, if the only two hits in the index are prefix hits for "bar"
  // and "bat", then both will appear on a posting list for "ba". "b" won't
  // have a posting list, but "ba" will suffice.
| IcingDynamicTrie::PropertyReader hits_in_prefix_section( |
| *main_lexicon_, GetHasHitsInPrefixSectionPropertyId()); |
| IcingDynamicTrie::Iterator main_itr(*main_lexicon_, prefix.c_str()); |
| if (!main_itr.IsValid()) { |
| return absl_ports::NotFoundError(IcingStringUtil::StringPrintf( |
| "Term: %s is not present in the main lexicon.", prefix.c_str())); |
| } |
| exact = (prefix.length() == strlen(main_itr.GetKey())); |
| |
| if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) { |
| // Found it, but it doesn't have prefix hits. Exit early. No need to |
| // retrieve the posting list because there's nothing there for us. |
| return absl_ports::NotFoundError("The term doesn't have any prefix hits."); |
| } |
| PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; |
| memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id)); |
| ICING_ASSIGN_OR_RETURN( |
| std::unique_ptr<PostingListHitAccessor> pl_accessor, |
| PostingListHitAccessor::CreateFromExisting( |
| flash_index_storage_.get(), posting_list_hit_serializer_.get(), |
| posting_list_id)); |
| return GetPrefixAccessorResult(std::move(pl_accessor), exact); |
| } |
| |
| // TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty(). |
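// Example (illustrative): with namespace_ids = {1, 3}, a term matches if its
// value index has the property GetNamespacePropertyId(1) or
// GetNamespacePropertyId(3); an empty namespace_ids vector matches all terms.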
| bool IsTermInNamespaces( |
| const IcingDynamicTrie::PropertyReadersAll& property_reader, |
| uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) { |
| if (namespace_ids.empty()) { |
| return true; |
| } |
| for (NamespaceId namespace_id : namespace_ids) { |
| if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id), |
| value_index)) { |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| libtextclassifier3::StatusOr<std::vector<TermMetadata>> |
| MainIndex::FindTermsByPrefix( |
| const std::string& prefix, TermMatchType::Code scoring_match_type, |
| SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by, |
| const SuggestionResultChecker* suggestion_result_checker) { |
| // Finds all the terms that start with the given prefix in the lexicon. |
| IcingDynamicTrie::Iterator term_iterator(*main_lexicon_, prefix.c_str()); |
| |
| std::vector<TermMetadata> term_metadata_list; |
| while (term_iterator.IsValid()) { |
| int score = 0; |
| DocumentId last_document_id = kInvalidDocumentId; |
| bool is_last_document_in_desired = false; |
| |
| PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; |
| memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id)); |
| ICING_ASSIGN_OR_RETURN( |
| std::unique_ptr<PostingListHitAccessor> pl_accessor, |
| PostingListHitAccessor::CreateFromExisting( |
| flash_index_storage_.get(), posting_list_hit_serializer_.get(), |
| posting_list_id)); |
| ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits, |
| pl_accessor->GetNextHitsBatch()); |
| while (!hits.empty()) { |
| for (const Hit& hit : hits) { |
| // Check whether this Hit is desired. |
| DocumentId document_id = hit.document_id(); |
| bool is_new_document = document_id != last_document_id; |
| if (is_new_document) { |
| last_document_id = document_id; |
| is_last_document_in_desired = |
| suggestion_result_checker->BelongsToTargetResults( |
| document_id, hit.section_id()); |
| } |
| if (!is_last_document_in_desired) { |
          // The document is removed or expired, or does not belong to the
          // target namespaces.
| continue; |
| } |
| if (scoring_match_type == TermMatchType::EXACT_ONLY && |
| hit.is_prefix_hit()) { |
| continue; |
| } |
| |
| // Score the hit by the strategy |
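        // (Illustrative: with TERM_FREQUENCY, hits with frequencies 3 and 5
        // contribute 8; with DOCUMENT_COUNT, each newly-seen document adds 1;
        // with NONE, the first desired hit sets the score to 1 and stops.)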
| if (score_by == |
| SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE) { |
          // Give a score of 1 to all matched terms and return them in
          // arbitrary order.
| score = 1; |
| break; |
| } else if (score_by == SuggestionScoringSpecProto:: |
| SuggestionRankingStrategy::DOCUMENT_COUNT && |
| is_new_document) { |
| ++score; |
| } else if (score_by == SuggestionScoringSpecProto:: |
| SuggestionRankingStrategy::TERM_FREQUENCY) { |
| if (hit.has_term_frequency()) { |
| score += hit.term_frequency(); |
| } else { |
| ++score; |
| } |
| } |
| } |
| if (score_by == |
| SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE && |
| score == 1) { |
        // The term is desired and does not need to be scored.
| break; |
| } |
| ICING_ASSIGN_OR_RETURN(hits, pl_accessor->GetNextHitsBatch()); |
| } |
| if (score > 0) { |
| term_metadata_list.push_back(TermMetadata(term_iterator.GetKey(), score)); |
| } |
| |
| term_iterator.Advance(); |
| } |
| return term_metadata_list; |
| } |
| |
| libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs> |
| MainIndex::AddBackfillBranchPoints(const IcingDynamicTrie& other_lexicon) { |
  // Maps each new branching point in the main lexicon to the smallest term
  // that extends it: branching_point_term is a prefix of term, and no other
  // term sorts between branching_point_term and term.
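  // (Illustrative: if inserting "foul" creates the new branch point "fo" and
  // the shortest existing term with prefix hits under "fo" is "foo", then
  // backfill_map[tvi("fo")] = tvi("foo").)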
| std::string prefix; |
| LexiconMergeOutputs outputs; |
| for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/""); |
| other_term_itr.IsValid(); other_term_itr.Advance()) { |
    // If this term were inserted in the main lexicon, what new branching
    // point would it create? (It always creates at most one.)
| int prefix_len = main_lexicon_->FindNewBranchingPrefixLength( |
| other_term_itr.GetKey(), /*utf8=*/true); |
| if (prefix_len <= 0) { |
| continue; |
| } |
| prefix.assign(other_term_itr.GetKey(), prefix_len); |
| |
    // Figure out the backfill tvi. It might not exist, since all of the child
    // terms could contain hits from non-prefix sections only.
| // |
| // Ex. Suppose that the main lexicon contains "foot" and "fool" and that |
| // we're adding "foul". The new branching prefix will be "fo". The backfill |
| // prefix will be "foo" - all hits in prefix section on "foo" will need to |
| // be added to the new "fo" posting list later. |
| FindTermResult result = |
| FindShortestValidTermWithPrefixHits(main_lexicon_.get(), prefix); |
| if (!result.found || result.exact) { |
| continue; |
| } |
| |
    // This is a new prefix that will need backfilling from its next-in-line
    // posting list. This new prefix will have to have a posting list
    // eventually, so insert a default PostingListIdentifier as a placeholder.
| uint32_t branching_prefix_tvi; |
| bool new_key; |
| PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; |
| libtextclassifier3::Status status = main_lexicon_->Insert( |
| prefix.c_str(), &posting_list_id, &branching_prefix_tvi, |
| /*replace=*/false, &new_key); |
| if (!status.ok()) { |
| ICING_LOG(DBG) << "Could not insert branching prefix\n" |
| << status.error_message(); |
| return status; |
| } |
| |
    // Backfill posting lists only contain prefix hits by default. So set
    // these properties here; they may be overridden when hits from the other
    // index are added later.
| if (!main_lexicon_->SetProperty(branching_prefix_tvi, |
| GetHasNoExactHitsPropertyId()) || |
| !main_lexicon_->SetProperty(branching_prefix_tvi, |
| GetHasHitsInPrefixSectionPropertyId())) { |
| return absl_ports::InternalError("Setting prefix prop failed"); |
| } |
| |
| outputs.backfill_map[branching_prefix_tvi] = result.tvi; |
| } |
| return outputs; |
| } |
| |
| libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs> |
| MainIndex::AddTerms(const IcingDynamicTrie& other_lexicon, |
| LexiconMergeOutputs&& outputs) { |
| IcingDynamicTrie::PropertyReadersAll new_term_prop_readers(other_lexicon); |
| for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/""); |
| other_term_itr.IsValid(); other_term_itr.Advance()) { |
| uint32_t new_main_tvi; |
| PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; |
| libtextclassifier3::Status status = main_lexicon_->Insert( |
| other_term_itr.GetKey(), &posting_list_id, &new_main_tvi, |
| /*replace=*/false); |
| if (!status.ok()) { |
| ICING_LOG(DBG) << "Could not insert term: " << other_term_itr.GetKey() |
| << "\n" |
| << status.error_message(); |
| return status; |
| } |
| |
| // Copy the properties from the other lexicon over to the main lexicon. |
| uint32_t other_tvi = other_term_itr.GetValueIndex(); |
| if (!CopyProperties(new_term_prop_readers, other_lexicon, other_tvi, |
| new_main_tvi)) { |
| return absl_ports::InternalError(absl_ports::StrCat( |
| "Could not insert term: ", other_term_itr.GetKey())); |
| } |
| |
| // Add other to main mapping. |
| outputs.other_tvi_to_main_tvi.emplace(other_tvi, new_main_tvi); |
| |
| memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(new_main_tvi), |
| sizeof(posting_list_id)); |
| if (posting_list_id.block_index() != kInvalidBlockIndex) { |
| outputs.main_tvi_to_block_index[new_main_tvi] = |
| posting_list_id.block_index(); |
| } |
| } |
| return std::move(outputs); |
| } |
| |
| libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs> |
| MainIndex::AddBranchPoints(const IcingDynamicTrie& other_lexicon, |
| LexiconMergeOutputs&& outputs) { |
| IcingDynamicTrie::PropertyReader has_prefix_prop_reader( |
| other_lexicon, GetHasHitsInPrefixSectionPropertyId()); |
| if (!has_prefix_prop_reader.Exists()) { |
| return std::move(outputs); |
| } |
| std::string prefix; |
| for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/""); |
| other_term_itr.IsValid(); other_term_itr.Advance()) { |
| // Only expand terms that have hits in prefix sections. |
| if (!has_prefix_prop_reader.HasProperty(other_term_itr.GetValueIndex())) { |
| continue; |
| } |
| |
| // Get prefixes where there is already a branching point in the main |
| // lexicon. We skip prefixes which don't already have a branching point. |
| std::vector<int> prefix_lengths = main_lexicon_->FindBranchingPrefixLengths( |
| other_term_itr.GetKey(), /*utf8=*/true); |
| |
| int buf_start = outputs.prefix_tvis_buf.size(); |
| // Add prefixes. |
| for (int prefix_length : prefix_lengths) { |
| if (prefix_length <= 0) { |
| continue; |
| } |
| |
| prefix.assign(other_term_itr.GetKey(), prefix_length); |
| uint32_t prefix_tvi; |
| bool new_key; |
| PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; |
| libtextclassifier3::Status status = |
| main_lexicon_->Insert(prefix.c_str(), &posting_list_id, &prefix_tvi, |
| /*replace=*/false, &new_key); |
| if (!status.ok()) { |
| ICING_LOG(DBG) << "Could not insert prefix: " << prefix << "\n" |
| << status.error_message(); |
| return status; |
| } |
| |
| // Prefix tvi will have hits in prefix section. |
| if (!main_lexicon_->SetProperty(prefix_tvi, |
| GetHasHitsInPrefixSectionPropertyId())) { |
| return absl_ports::InternalError( |
| "Setting has hits in prefix section prop failed"); |
| } |
| |
| // If it hasn't been added by non-prefix term insertions in |
| // AddBackfillBranchPoints and AddTerms, it is a prefix-only term. |
| if (new_key && !main_lexicon_->SetProperty( |
| prefix_tvi, GetHasNoExactHitsPropertyId())) { |
| return absl_ports::InternalError("Setting no exact hits prop failed"); |
| } |
| |
| outputs.prefix_tvis_buf.push_back(prefix_tvi); |
| |
| memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(prefix_tvi), |
| sizeof(posting_list_id)); |
| if (posting_list_id.block_index() != kInvalidBlockIndex) { |
| outputs.main_tvi_to_block_index[prefix_tvi] = |
| posting_list_id.block_index(); |
| } |
| } |
| |
| // Any prefixes added? Then add to map. |
| if (buf_start < outputs.prefix_tvis_buf.size()) { |
| outputs.other_tvi_to_prefix_main_tvis[other_term_itr.GetValueIndex()] = { |
| buf_start, outputs.prefix_tvis_buf.size() - buf_start}; |
| } |
| } |
| return std::move(outputs); |
| } |
| |
| bool MainIndex::CopyProperties( |
| const IcingDynamicTrie::PropertyReadersAll& prop_reader, |
| const IcingDynamicTrie& other_lexicon, uint32_t other_tvi, |
| uint32_t new_main_tvi) { |
| for (uint32_t property_id = 0; property_id < prop_reader.size(); |
| ++property_id) { |
| if (property_id == GetHasNoExactHitsPropertyId()) { |
| // HasNoExactHitsProperty is an inverse. If other_lexicon has exact hits |
| // for this term, then HasNoExactHits needs to be set to false in |
| // main_lexicon. If other_lexicon has no exact hits for this term, then |
| // HasNoExactHits in the main_lexicon should not be modified. |
| if (!prop_reader.HasProperty(property_id, other_tvi) && |
| !main_lexicon_->ClearProperty(new_main_tvi, property_id)) { |
| ICING_LOG(ERROR) << "Clearing HasNoExactHitsProperty failed"; |
| return false; |
| } |
| } else { |
| // If other_lexicon has this property set for this term, then that |
| // property needs to be set for the main_lexicon. If other_lexicon |
| // doesn't have this property set, then the property in the main lexicon |
| // should not be modified. |
| if (prop_reader.HasProperty(property_id, other_tvi) && |
| !main_lexicon_->SetProperty(new_main_tvi, property_id)) { |
| return false; |
| } |
| } |
| } |
| return true; |
| } |
| |
| libtextclassifier3::Status MainIndex::AddHits( |
| const TermIdCodec& term_id_codec, |
| std::unordered_map<uint32_t, uint32_t>&& backfill_map, |
| std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id) { |
| if (hits.empty()) { |
| flash_index_storage_->set_last_indexed_docid(last_added_document_id); |
| return libtextclassifier3::Status::OK; |
| } |
| uint32_t cur_term_id = hits[0].term_id(); |
| ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term, |
| term_id_codec.DecodeTermInfo(cur_term_id)); |
| // Iterate through all hits. If these hits are for a term that also needs |
| // backfill, then backfill first and then add the new hits. |
| size_t k_start = 0; |
| size_t k_end = 0; |
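  // Each pass of the outer loop below consumes one run of hits that share a
  // term id, e.g. for term ids [5, 5, 7, 9, 9, 9] the runs are [0, 2),
  // [2, 3) and [3, 6).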
| while (k_start < hits.size()) { |
| uint32_t term_id = hits[k_end].term_id(); |
| while (term_id == cur_term_id && ++k_end < hits.size()) { |
| term_id = hits[k_end].term_id(); |
| } |
| |
| // Look for backfill. |
| PostingListIdentifier backfill_posting_list_id = |
| PostingListIdentifier::kInvalid; |
| auto itr = backfill_map.find(cur_decoded_term.tvi); |
| if (itr != backfill_map.end()) { |
| const void* value = main_lexicon_->GetValueAtIndex(itr->second); |
| memcpy(&backfill_posting_list_id, value, |
| sizeof(backfill_posting_list_id)); |
| backfill_map.erase(itr); |
| } |
| ICING_RETURN_IF_ERROR(AddHitsForTerm(cur_decoded_term.tvi, |
| backfill_posting_list_id, |
| &hits[k_start], k_end - k_start)); |
| cur_term_id = term_id; |
| ICING_ASSIGN_OR_RETURN(cur_decoded_term, |
| term_id_codec.DecodeTermInfo(cur_term_id)); |
| k_start = k_end; |
| } |
| |
| // Now copy remaining backfills. |
| ICING_VLOG(1) << "Remaining backfills " << backfill_map.size(); |
| for (auto other_tvi_main_tvi_pair : backfill_map) { |
| PostingListIdentifier backfill_posting_list_id = |
| PostingListIdentifier::kInvalid; |
| memcpy(&backfill_posting_list_id, |
| main_lexicon_->GetValueAtIndex(other_tvi_main_tvi_pair.second), |
| sizeof(backfill_posting_list_id)); |
| ICING_ASSIGN_OR_RETURN( |
| std::unique_ptr<PostingListHitAccessor> hit_accum, |
| PostingListHitAccessor::Create(flash_index_storage_.get(), |
| posting_list_hit_serializer_.get())); |
| ICING_RETURN_IF_ERROR( |
| AddPrefixBackfillHits(backfill_posting_list_id, hit_accum.get())); |
| PostingListAccessor::FinalizeResult result = |
| std::move(*hit_accum).Finalize(); |
| if (result.id.is_valid()) { |
| main_lexicon_->SetValueAtIndex(other_tvi_main_tvi_pair.first, &result.id); |
| } |
| } |
| flash_index_storage_->set_last_indexed_docid(last_added_document_id); |
| return libtextclassifier3::Status::OK; |
| } |
| |
| libtextclassifier3::Status MainIndex::AddHitsForTerm( |
| uint32_t tvi, PostingListIdentifier backfill_posting_list_id, |
| const TermIdHitPair* hit_elements, size_t len) { |
| // 1. Create a PostingListHitAccessor - either from the pre-existing block, if |
| // one exists, or from scratch. |
| PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; |
| memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(tvi), |
| sizeof(posting_list_id)); |
| std::unique_ptr<PostingListHitAccessor> pl_accessor; |
| if (posting_list_id.is_valid()) { |
| if (posting_list_id.block_index() >= flash_index_storage_->num_blocks()) { |
| ICING_LOG(ERROR) << "Index dropped hits. Invalid block index " |
| << posting_list_id.block_index() |
| << " >= " << flash_index_storage_->num_blocks(); |
| // TODO(b/159918304) : Consider revising the checksumming strategy in the |
| // main index. Providing some mechanism to check for corruption - either |
| // during initialization or some later time would allow us to avoid |
| // whack-a-mole with odd corruption issues like this one (b/62820689). |
| return absl_ports::InternalError( |
| "Valid posting list has an invalid block index!"); |
| } |
| ICING_ASSIGN_OR_RETURN( |
| pl_accessor, PostingListHitAccessor::CreateFromExisting( |
| flash_index_storage_.get(), |
| posting_list_hit_serializer_.get(), posting_list_id)); |
| } else { |
| // New posting list. |
| ICING_ASSIGN_OR_RETURN( |
| pl_accessor, |
| PostingListHitAccessor::Create(flash_index_storage_.get(), |
| posting_list_hit_serializer_.get())); |
| } |
| |
| // 2. Backfill any hits if necessary. |
| if (backfill_posting_list_id.is_valid()) { |
| ICING_RETURN_IF_ERROR( |
| AddPrefixBackfillHits(backfill_posting_list_id, pl_accessor.get())); |
| } |
| |
| // 3. Add all the new hits. |
| for (int i = len - 1; i >= 0; --i) { |
| Hit hit = hit_elements[i].hit(); |
| ICING_RETURN_IF_ERROR(pl_accessor->PrependHit(hit)); |
| } |
| |
| // 4. Finalize this posting list and put its identifier in the lexicon. |
| PostingListAccessor::FinalizeResult result = |
| std::move(*pl_accessor).Finalize(); |
| if (result.id.is_valid()) { |
| main_lexicon_->SetValueAtIndex(tvi, &result.id); |
| } |
| return libtextclassifier3::Status::OK; |
| } |
| |
| libtextclassifier3::Status MainIndex::AddPrefixBackfillHits( |
| PostingListIdentifier backfill_posting_list_id, |
| PostingListHitAccessor* hit_accum) { |
| ICING_ASSIGN_OR_RETURN( |
| std::unique_ptr<PostingListHitAccessor> backfill_accessor, |
| PostingListHitAccessor::CreateFromExisting( |
| flash_index_storage_.get(), posting_list_hit_serializer_.get(), |
| backfill_posting_list_id)); |
| std::vector<Hit> backfill_hits; |
| ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp, |
| backfill_accessor->GetNextHitsBatch()); |
| while (!tmp.empty()) { |
| std::copy(tmp.begin(), tmp.end(), std::back_inserter(backfill_hits)); |
| ICING_ASSIGN_OR_RETURN(tmp, backfill_accessor->GetNextHitsBatch()); |
| } |
| |
| Hit last_added_hit(Hit::kInvalidValue); |
| // The hits in backfill_hits are in the reverse order of how they were added. |
| // Iterate in reverse to add them to this new posting list in the correct |
| // order. |
| for (auto itr = backfill_hits.rbegin(); itr != backfill_hits.rend(); ++itr) { |
| const Hit& hit = *itr; |
| // Skip hits from non-prefix-enabled sections. |
| if (!hit.is_in_prefix_section()) { |
| continue; |
| } |
| |
| // A backfill hit is a prefix hit in a prefix section. |
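    // (Illustrative: an exact hit {section=2, document=7, term_frequency=3,
    // in_prefix_section=true, is_prefix_hit=false} is re-added as the same
    // hit with is_prefix_hit=true.)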
| const Hit backfill_hit(hit.section_id(), hit.document_id(), |
| hit.term_frequency(), |
| /*is_in_prefix_section=*/true, |
| /*is_prefix_hit=*/true); |
| if (backfill_hit == last_added_hit) { |
| // Skip duplicate values due to overriding of the is_prefix flag. |
| continue; |
| } |
| last_added_hit = backfill_hit; |
| ICING_RETURN_IF_ERROR(hit_accum->PrependHit(backfill_hit)); |
| } |
| return libtextclassifier3::Status::OK; |
| } |
| |
| std::string MainIndex::GetDebugInfo(DebugInfoVerbosity::Code verbosity) const { |
| std::string res; |
| |
| // Lexicon. |
| std::string lexicon_info; |
| main_lexicon_->GetDebugInfo(verbosity, &lexicon_info); |
| |
| IcingStringUtil::SStringAppendF(&res, 0, |
| "last_added_document_id: %u\n" |
| "\n" |
| "main_lexicon_info:\n%s\n", |
| last_added_document_id(), |
| lexicon_info.c_str()); |
| |
| if (verbosity == DebugInfoVerbosity::BASIC) { |
| return res; |
| } |
| |
| std::string flash_index_storage_info; |
| flash_index_storage_->GetDebugInfo(verbosity, &flash_index_storage_info); |
| IcingStringUtil::SStringAppendF(&res, 0, "flash_index_storage_info:\n%s\n", |
| flash_index_storage_info.c_str()); |
| return res; |
| } |
| |
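// Rebuilds the index in a temporary directory ("<base_dir_>_temp"), keeping
// only hits that survive the document_id_old_to_new mapping, then swaps the
// rebuilt files into base_dir_ and re-runs Init() so that
// flash_index_storage_ and main_lexicon_ point at the new files.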
| libtextclassifier3::Status MainIndex::Optimize( |
| const std::vector<DocumentId>& document_id_old_to_new) { |
| std::string temporary_index_dir_path = base_dir_ + "_temp"; |
| if (!filesystem_->DeleteDirectoryRecursively( |
| temporary_index_dir_path.c_str())) { |
| ICING_LOG(ERROR) << "Recursively deleting " << temporary_index_dir_path; |
| return absl_ports::InternalError( |
| "Unable to delete temp directory to prepare to build new index."); |
| } |
| |
| DestructibleDirectory temporary_index_dir( |
| filesystem_, std::move(temporary_index_dir_path)); |
| if (!temporary_index_dir.is_valid()) { |
| return absl_ports::InternalError( |
| "Unable to create temp directory to build new index."); |
| } |
| |
| ICING_ASSIGN_OR_RETURN(std::unique_ptr<MainIndex> new_index, |
| MainIndex::Create(temporary_index_dir.dir(), |
| filesystem_, icing_filesystem_)); |
| ICING_RETURN_IF_ERROR(TransferIndex(document_id_old_to_new, new_index.get())); |
| ICING_RETURN_IF_ERROR(new_index->PersistToDisk()); |
| new_index = nullptr; |
| flash_index_storage_ = nullptr; |
| main_lexicon_ = nullptr; |
| |
| if (!filesystem_->SwapFiles(temporary_index_dir.dir().c_str(), |
| base_dir_.c_str())) { |
| return absl_ports::InternalError( |
| "Unable to apply new index due to failed swap!"); |
| } |
| |
| // Reinitialize the index so that flash_index_storage_ and main_lexicon_ are |
| // properly updated. |
| return Init(); |
| } |
| |
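// Translates every hit from old_pl_accessor through document_id_old_to_new
// and adds the surviving hits to new_index under `term`. Illustrative
// mapping: with document_id_old_to_new = {0, kInvalidDocumentId, 1}, a hit
// on old document 2 is rewritten to document 1, and a hit on old document 1
// (deleted) is dropped. Returns the largest translated document id, or
// kInvalidDocumentId if no hit survived.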
| libtextclassifier3::StatusOr<DocumentId> MainIndex::TransferAndAddHits( |
| const std::vector<DocumentId>& document_id_old_to_new, const char* term, |
| PostingListHitAccessor& old_pl_accessor, MainIndex* new_index) { |
| std::vector<Hit> new_hits; |
| bool has_no_exact_hits = true; |
| bool has_hits_in_prefix_section = false; |
| // The largest document id after translating hits. |
| DocumentId largest_document_id = kInvalidDocumentId; |
| ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp, |
| old_pl_accessor.GetNextHitsBatch()); |
| while (!tmp.empty()) { |
| for (const Hit& hit : tmp) { |
      // A safety check to add robustness to the codebase: make sure that we
      // never access invalid memory, in case the hit read from the posting
      // list is corrupted.
| if (hit.document_id() < 0 || |
| hit.document_id() >= document_id_old_to_new.size()) { |
| continue; |
| } |
| DocumentId new_document_id = document_id_old_to_new[hit.document_id()]; |
| // Transfer the document id of the hit, if the document is not deleted |
| // or outdated. |
| if (new_document_id != kInvalidDocumentId) { |
| if (hit.is_in_prefix_section()) { |
| has_hits_in_prefix_section = true; |
| } |
| if (!hit.is_prefix_hit()) { |
| has_no_exact_hits = false; |
| } |
| if (largest_document_id == kInvalidDocumentId || |
| new_document_id > largest_document_id) { |
| largest_document_id = new_document_id; |
| } |
| new_hits.push_back(Hit::TranslateHit(hit, new_document_id)); |
| } |
| } |
| ICING_ASSIGN_OR_RETURN(tmp, old_pl_accessor.GetNextHitsBatch()); |
| } |
  // A term without exact hits indicates that it is a purely backfill term. If
  // the term is not a branching point in the new trie, backfilling is no
  // longer necessary, so we can skip it.
| if (new_hits.empty() || |
| (has_no_exact_hits && !new_index->main_lexicon_->IsBranchingTerm(term))) { |
| return largest_document_id; |
| } |
| |
| ICING_ASSIGN_OR_RETURN(std::unique_ptr<PostingListHitAccessor> hit_accum, |
| PostingListHitAccessor::Create( |
| new_index->flash_index_storage_.get(), |
| new_index->posting_list_hit_serializer_.get())); |
| for (auto itr = new_hits.rbegin(); itr != new_hits.rend(); ++itr) { |
| ICING_RETURN_IF_ERROR(hit_accum->PrependHit(*itr)); |
| } |
| PostingListAccessor::FinalizeResult result = std::move(*hit_accum).Finalize(); |
| if (!result.id.is_valid()) { |
| return absl_ports::InternalError( |
| absl_ports::StrCat("Failed to add translated hits for term: ", term)); |
| } |
| uint32_t tvi; |
| libtextclassifier3::Status status = |
| new_index->main_lexicon_->Insert(term, &result.id, &tvi, |
| /*replace=*/false); |
| if (!status.ok()) { |
| ICING_LOG(DBG) << "Could not transfer main index for term: " << term << "\n" |
| << status.error_message(); |
| return status; |
| } |
| if (has_no_exact_hits && !new_index->main_lexicon_->SetProperty( |
| tvi, GetHasNoExactHitsPropertyId())) { |
| return absl_ports::InternalError("Setting prefix prop failed"); |
| } |
| if (has_hits_in_prefix_section && |
| !new_index->main_lexicon_->SetProperty( |
| tvi, GetHasHitsInPrefixSectionPropertyId())) { |
| return absl_ports::InternalError("Setting prefix prop failed"); |
| } |
| return largest_document_id; |
| } |
| |
| libtextclassifier3::Status MainIndex::TransferIndex( |
| const std::vector<DocumentId>& document_id_old_to_new, |
| MainIndex* new_index) { |
| DocumentId largest_document_id = kInvalidDocumentId; |
| for (IcingDynamicTrie::Iterator term_itr(*main_lexicon_, /*prefix=*/"", |
| /*reverse=*/true); |
| term_itr.IsValid(); term_itr.Advance()) { |
| PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid; |
| memcpy(&posting_list_id, term_itr.GetValue(), sizeof(posting_list_id)); |
| if (posting_list_id == PostingListIdentifier::kInvalid) { |
      // There is no valid posting list to transfer for this term; log and
      // skip it.
| ICING_LOG(ERROR) |
| << "Got invalid posting_list_id from previous main index"; |
| continue; |
| } |
| ICING_ASSIGN_OR_RETURN( |
| std::unique_ptr<PostingListHitAccessor> pl_accessor, |
| PostingListHitAccessor::CreateFromExisting( |
| flash_index_storage_.get(), posting_list_hit_serializer_.get(), |
| posting_list_id)); |
| ICING_ASSIGN_OR_RETURN( |
| DocumentId curr_largest_document_id, |
| TransferAndAddHits(document_id_old_to_new, term_itr.GetKey(), |
| *pl_accessor, new_index)); |
| if (curr_largest_document_id == kInvalidDocumentId) { |
| continue; |
| } |
| if (largest_document_id == kInvalidDocumentId || |
| curr_largest_document_id > largest_document_id) { |
| largest_document_id = curr_largest_document_id; |
| } |
| } |
| new_index->flash_index_storage_->set_last_indexed_docid(largest_document_id); |
| return libtextclassifier3::Status::OK; |
| } |
| |
| } // namespace lib |
| } // namespace icing |