icing/index/main/main-index_test.cc - platform/external/icing - Git at Google

 // Copyright (C) 2019 Google LLC
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include "icing/index/main/main-index.h"

 #include <cstdint>
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>

 #include "icing/text_classifier/lib3/utils/base/status.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include "icing/file/filesystem.h"
 #include "icing/index/hit/doc-hit-info.h"
 #include "icing/index/hit/hit.h"
 #include "icing/index/iterator/doc-hit-info-iterator.h"
 #include "icing/index/lite/lite-index.h"
 #include "icing/index/lite/term-id-hit-pair.h"
 #include "icing/index/main/doc-hit-info-iterator-term-main.h"
 #include "icing/index/main/main-index-merger.h"
 #include "icing/index/term-id-codec.h"
 #include "icing/legacy/index/icing-dynamic-trie.h"
 #include "icing/legacy/index/icing-filesystem.h"
 #include "icing/legacy/index/icing-mock-filesystem.h"
 #include "icing/schema/section.h"
 #include "icing/store/document-id.h"
 #include "icing/store/namespace-id.h"
 #include "icing/testing/common-matchers.h"
 #include "icing/testing/tmp-directory.h"
 #include "icing/util/status-macros.h"

 namespace icing {
 namespace lib {

 namespace {

 using ::testing::ElementsAre;
 using ::testing::Eq;
 using ::testing::IsEmpty;
 using ::testing::NiceMock;
 using ::testing::Return;
 using ::testing::SizeIs;

 std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
   std::vector<DocHitInfo> infos;
   while (iterator->Advance().ok()) {
     infos.push_back(iterator->doc_hit_info());
   }
   return infos;
 }

 std::vector<DocHitInfo> GetExactHits(
     MainIndex* main_index, int term_start_index, int unnormalized_term_length,
     const std::string& term, SectionIdMask section_mask = kSectionIdMaskAll) {
   auto iterator = std::make_unique<DocHitInfoIteratorTermMainExact>(
       main_index, term, term_start_index, unnormalized_term_length,
       section_mask, /*need_hit_term_frequency=*/true);
   return GetHits(std::move(iterator));
 }

 std::vector<DocHitInfo> GetPrefixHits(
     MainIndex* main_index, int term_start_index, int unnormalized_term_length,
     const std::string& term, SectionIdMask section_mask = kSectionIdMaskAll) {
   auto iterator = std::make_unique<DocHitInfoIteratorTermMainPrefix>(
       main_index, term, term_start_index, unnormalized_term_length,
       section_mask, /*need_hit_term_frequency=*/true);
   return GetHits(std::move(iterator));
 }

 libtextclassifier3::Status Merge(const LiteIndex& lite_index,
                                  const TermIdCodec& term_id_codec,
                                  MainIndex* main_index) {
   ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs,
                          main_index->MergeLexicon(lite_index.lexicon()));
   ICING_ASSIGN_OR_RETURN(std::vector<TermIdHitPair> term_id_hit_pairs,
                          MainIndexMerger::TranslateAndExpandLiteHits(
                              lite_index, term_id_codec, outputs));
   return main_index->AddHits(term_id_codec, std::move(outputs.backfill_map),
                              std::move(term_id_hit_pairs),
                              lite_index.last_added_document_id());
 }

 class MainIndexTest : public testing::Test {
  protected:
   void SetUp() override {
     index_dir_ = GetTestTempDir() + "/test_dir";
     ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));

     std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
     LiteIndex::Options options(lite_index_file_name,
                                /*hit_buffer_want_merge_bytes=*/1024 * 1024,
                                /*hit_buffer_sort_at_indexing=*/true,
                                /*hit_buffer_sort_threshold_bytes=*/1024 * 8);
     ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
                                LiteIndex::Create(options, &icing_filesystem_));

     ICING_ASSERT_OK_AND_ASSIGN(
         term_id_codec_,
         TermIdCodec::Create(
             IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
             IcingDynamicTrie::max_value_index(options.lexicon_options)));
   }

   void TearDown() override {
     term_id_codec_.reset();
     lite_index_.reset();
     ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
   }

   std::string index_dir_;
   Filesystem filesystem_;
   IcingFilesystem icing_filesystem_;
   std::unique_ptr<LiteIndex> lite_index_;
   std::unique_ptr<TermIdCodec> term_id_codec_;
 };

 constexpr NamespaceId kNamespace0 = 0;

 TEST_F(MainIndexTest, MainIndexCreateIOFailure) {
   // Create the index with mock filesystem. By default, Mock will return false,
   // so the first attempted file operation will fail.
   NiceMock<IcingMockFilesystem> mock_icing_filesystem;
   ON_CALL(mock_icing_filesystem, CreateDirectoryRecursively)
       .WillByDefault(Return(false));
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   EXPECT_THAT(MainIndex::Create(main_index_file_name, &filesystem_,
                                 &mock_icing_filesystem),
               StatusIs(libtextclassifier3::StatusCode::INTERNAL));
 }

 TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixTermNotFound) {
   // Create the main index. It should have no entries in its lexicon.
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));
   EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }

 TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) {
   // 1. Index one doc in the Lite Index:
   // - Doc0 {"foot" is_in_prefix_section=true}
   ICING_ASSERT_OK_AND_ASSIGN(
       uint32_t tvi,
       lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));

   Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));

   // 2. Create the main index. It should have no entries in its lexicon.
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));

   // 3. Merge the index. The main index should contain "foo".
   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
   // GetAccessorForPrefixTerm should return a valid accessor for "foo".
   EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), IsOk());
 }

 TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsNotFound) {
   // 1. Index one doc in the Lite Index:
   // - Doc0 {"foot" is_in_prefix_section=false}
   ICING_ASSERT_OK_AND_ASSIGN(
       uint32_t tvi,
       lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));

   Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));

   // 2. Create the main index. It should have no entries in its lexicon.
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));

   // 3. Merge the index. The main index should return not found when we search
   // prefix contain "foo".
   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
   // GetAccessorForPrefixTerm should return a valid accessor for "foo".
   EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }

 TEST_F(MainIndexTest, MainIndexGetAccessorForExactTermNotFound) {
   // Create the main index. It should have no entries in its lexicon.
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));
   EXPECT_THAT(main_index->GetAccessorForExactTerm("foo"),
               StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
 }

 TEST_F(MainIndexTest, MainIndexGetAccessorForExactReturnsValidAccessor) {
   // 1. Index one doc in the Lite Index:
   // - Doc0 {"foo" is_in_prefix_section=false}
   ICING_ASSERT_OK_AND_ASSIGN(
       uint32_t tvi,
       lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));

   Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));

   // 2. Create the main index. It should have no entries in its lexicon.
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));

   // 3. Merge the index. The main index should contain "foo".
   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
   // GetAccessorForPrefixTerm should return a valid accessor for "foo".
   EXPECT_THAT(main_index->GetAccessorForExactTerm("foo"), IsOk());
 }

 TEST_F(MainIndexTest, MergeIndexToEmpty) {
   // 1. Index three docs in the Lite Index:
   // - Doc0 {"foot", "fool", "far" is_in_prefix_section=false}
   // - Doc1 {"foot", "fool" is_in_prefix_section=true}
   // - Doc2 {"fool", "far" is_in_prefix_section=false}
   ICING_ASSERT_OK_AND_ASSIGN(
       uint32_t tvi,
       lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));
   ICING_ASSERT_OK_AND_ASSIGN(
       tvi, lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));
   ICING_ASSERT_OK_AND_ASSIGN(
       tvi,
       lite_index_->InsertTerm("far", TermMatchType::EXACT_ONLY, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));

   Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit));

   Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));

   Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit));

   // 2. Create the main index. It should have no entries in its lexicon.
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));

   std::vector<DocHitInfo> hits =
       GetExactHits(main_index.get(), /*term_start_index=*/0,
                    /*unnormalized_term_length=*/0, "foot");
   EXPECT_THAT(hits, IsEmpty());
   hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
                        /*unnormalized_term_length=*/0, "fo");
   EXPECT_THAT(hits, IsEmpty());

   // 3. Merge the index. The main index should contain "fool", "foot"
   // and "far" as well as a branch points for "foo" and "f". "fa" and "fo"
   // should not be present because it is not a branch point.
   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
   // Get hits from an exact posting list.
   hits = GetExactHits(main_index.get(), /*term_start_index=*/0,
                       /*unnormalized_term_length=*/0, "foot");
   // We should get hits for "foot" in doc1 and doc0
   EXPECT_THAT(
       hits,
       ElementsAre(
           EqualsDocHitInfo(doc1_hit.document_id(),
                            std::vector<SectionId>{doc1_hit.section_id()}),
           EqualsDocHitInfo(doc0_hit.document_id(),
                            std::vector<SectionId>{doc0_hit.section_id()})));

   // Get hits from a branching point posting list. "fo" should redirect to "foo"
   hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
                        /*unnormalized_term_length=*/0, "fo");
   // We should get hits for "foot" in doc1 and "fool" in doc1. We shouldn't get
   // the hits for "foot" in doc0 and "fool" in doc0 and doc2 because they
   // weren't hits in prefix sections.
   EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
                         doc1_hit.document_id(),
                         std::vector<SectionId>{doc1_hit.section_id()})));
 }

 TEST_F(MainIndexTest, MergeIndexToPreexisting) {
   // 1. Index three docs in the Lite Index:
   // - Doc0 {"foot", "fool", "far" is_in_prefix_section=false}
   // - Doc1 {"foot", "fool" is_in_prefix_section=true}
   // - Doc2 {"fool", "far" is_in_prefix_section=false}
   ICING_ASSERT_OK_AND_ASSIGN(
       uint32_t tvi,
       lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));
   ICING_ASSERT_OK_AND_ASSIGN(
       tvi, lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));
   ICING_ASSERT_OK_AND_ASSIGN(
       tvi,
       lite_index_->InsertTerm("far", TermMatchType::EXACT_ONLY, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));

   Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit));

   Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));

   Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit));

   // 2. Create the main index. It should have no entries in its lexicon.
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));

   // 3. Merge the index. The main index should contain "fool", "foot"
   // and "far" as well as a branch points for "foo" and "f". "fa" and "fo"
   // should not be present because it is not a branch point.
   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));

   // 4. Index two docs in a new Lite Index:
   // - Doc3 {"foot", "four", "foul", "fall" is_in_prefix_section=false}
   // - Doc4 {"four", "foul" is_in_prefix_section=true}
   std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2";
   LiteIndex::Options options(lite_index_file_name2,
                              /*hit_buffer_want_merge_bytes=*/1024 * 1024,
                              /*hit_buffer_sort_at_indexing=*/true,
                              /*hit_buffer_sort_threshold_bytes=*/1024 * 8);
   ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
                              LiteIndex::Create(options, &icing_filesystem_));
   ICING_ASSERT_OK_AND_ASSIGN(
       tvi,
       lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(
       tvi, lite_index_->InsertTerm("four", TermMatchType::PREFIX, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t four_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));
   ICING_ASSERT_OK_AND_ASSIGN(
       tvi, lite_index_->InsertTerm("foul", TermMatchType::PREFIX, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foul_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));
   ICING_ASSERT_OK_AND_ASSIGN(
       tvi,
       lite_index_->InsertTerm("fall", TermMatchType::EXACT_ONLY, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t fall_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));

   Hit doc3_hit(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc3_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc3_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc3_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(fall_term_id, doc3_hit));

   Hit doc4_hit(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc4_hit));
   ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc4_hit));

   // 3. Merge the index. The main index should now contain "foul", "four"
   // and "fall", a branch points for "fou" and backfill points for "fo".
   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
   // Get hits from an exact posting list the existed before the merge.
   std::vector<DocHitInfo> hits =
       GetExactHits(main_index.get(), /*term_start_index=*/0,
                    /*unnormalized_term_length=*/0, "foot");

   // We should get hits for "foot" in doc3, doc1 and doc0
   EXPECT_THAT(
       hits,
       ElementsAre(
           EqualsDocHitInfo(doc3_hit.document_id(),
                            std::vector<SectionId>{doc3_hit.section_id()}),
           EqualsDocHitInfo(doc1_hit.document_id(),
                            std::vector<SectionId>{doc1_hit.section_id()}),
           EqualsDocHitInfo(doc0_hit.document_id(),
                            std::vector<SectionId>{doc0_hit.section_id()})));
   // Get hits from backfill posting list.
   hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
                        /*unnormalized_term_length=*/0, "fo");
   // We should get hits for "four" and "foul" in doc4 and hits for "foot" and
   // "fool" in doc1. We shouldn't get the hits for "foot" in doc0 and doc3,
   // "fool" in doc0 and doc2 or the hits for "four" and "foul" in doc4 because
   // they weren't hits in prefix sections.
   EXPECT_THAT(
       hits,
       ElementsAre(
           EqualsDocHitInfo(doc4_hit.document_id(),
                            std::vector<SectionId>{doc4_hit.section_id()}),
           EqualsDocHitInfo(doc1_hit.document_id(),
                            std::vector<SectionId>{doc1_hit.section_id()})));
 }

 TEST_F(MainIndexTest, ExactRetrievedInPrefixSearch) {
   // 1. Index two docs in the Lite Index:
   // - Doc0 {"foot" is_in_prefix_section=true}
   // - Doc1 {"foo" is_in_prefix_section=false}
   // - Doc2 {"foot" is_in_prefix_section=false}
   ICING_ASSERT_OK_AND_ASSIGN(
       uint32_t tvi,
       lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));
   ICING_ASSERT_OK_AND_ASSIGN(
       tvi,
       lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));

   Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));

   Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit));

   Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc2_hit));

   // 2. Create the main index. It should have no entries in its lexicon.
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));

   // 3. Merge the lite lexicon. The main lexicon should contain "foot" and
   // "foo".
   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
   std::vector<DocHitInfo> hits =
       GetPrefixHits(main_index.get(), /*term_start_index=*/0,
                     /*unnormalized_term_length=*/0, "foo");
   // We should get hits for "foo" in doc1 and doc0, but not in doc2 because it
   // is not a prefix hit.
   EXPECT_THAT(
       hits,
       ElementsAre(
           EqualsDocHitInfo(doc1_hit.document_id(),
                            std::vector<SectionId>{doc1_hit.section_id()}),
           EqualsDocHitInfo(doc0_hit.document_id(),
                            std::vector<SectionId>{doc0_hit.section_id()})));
 }

 TEST_F(MainIndexTest, PrefixNotRetrievedInExactSearch) {
   // 1. Index two docs in the Lite Index:
   // - Doc0 {"foot" is_in_prefix_section=true}
   // - Doc1 {"foo" is_in_prefix_section=false}
   // - Doc1 {"foo" is_in_prefix_section=true}
   ICING_ASSERT_OK_AND_ASSIGN(
       uint32_t tvi,
       lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));
   ICING_ASSERT_OK_AND_ASSIGN(
       tvi, lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));

   Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));

   Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit));

   Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc2_hit));

   // 2. Create the main index. It should have no entries in its lexicon.
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));

   // 3. Merge the lite lexicon. The main lexicon should contain "foot" and
   // "foo".
   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
   std::vector<DocHitInfo> hits =
       GetExactHits(main_index.get(), /*term_start_index=*/0,
                    /*unnormalized_term_length=*/0, "foo");

   // We should get hits for "foo" in doc2 and doc1, but not in doc0 because it
   // is not an exact hit.
   EXPECT_THAT(
       hits,
       ElementsAre(
           EqualsDocHitInfo(doc2_hit.document_id(),
                            std::vector<SectionId>{doc2_hit.section_id()}),
           EqualsDocHitInfo(doc1_hit.document_id(),
                            std::vector<SectionId>{doc1_hit.section_id()})));
 }

 TEST_F(MainIndexTest,
        SearchChainedPostingListsShouldMergeSectionsAndTermFrequency) {
   // Index 2048 document with 3 hits in each document. When merged into the main
   // index, this will 1) lead to a chained posting list and 2) split at least
   // one document's hits across multiple posting lists.
   const std::string term = "foot";

   ICING_ASSERT_OK_AND_ASSIGN(
       uint32_t tvi,
       lite_index_->InsertTerm(term, TermMatchType::EXACT_ONLY, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));

   for (DocumentId document_id = 0; document_id < 2048; ++document_id) {
     Hit::TermFrequency term_frequency = static_cast<Hit::TermFrequency>(
         document_id % Hit::kMaxTermFrequency + 1);
     Hit doc_hit0(
         /*section_id=*/0, /*document_id=*/document_id, term_frequency,
         /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
     ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit0));

     Hit doc_hit1(
         /*section_id=*/1, /*document_id=*/document_id, term_frequency,
         /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
     ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit1));

     Hit doc_hit2(
         /*section_id=*/2, /*document_id=*/document_id, term_frequency,
         /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
     ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit2));
   }

   // 2. Create the main index. It should have no entries in its lexicon.
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));

   // 3. Merge the lite index.
   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
   // Get hits for all documents containing "foot" - which should be all of them.

   auto iterator = std::make_unique<DocHitInfoIteratorTermMainExact>(
       main_index.get(), term, /*term_start_index=*/0,
       /*unnormalized_term_length=*/0, kSectionIdMaskAll,
       /*need_hit_term_frequency=*/true);

   DocumentId expected_document_id = 2047;
   while (iterator->Advance().ok()) {
     EXPECT_THAT(iterator->doc_hit_info(),
                 EqualsDocHitInfo(expected_document_id,
                                  std::vector<SectionId>{0, 1, 2}));

     std::vector<TermMatchInfo> matched_terms_stats;
     iterator->PopulateMatchedTermsStats(&matched_terms_stats);

     Hit::TermFrequency expected_term_frequency =
         static_cast<Hit::TermFrequency>(
             expected_document_id % Hit::kMaxTermFrequency + 1);
     ASSERT_THAT(matched_terms_stats, SizeIs(1));
     EXPECT_THAT(matched_terms_stats[0].term, Eq(term));
     EXPECT_THAT(matched_terms_stats[0].term_frequencies[0],
                 Eq(expected_term_frequency));
     EXPECT_THAT(matched_terms_stats[0].term_frequencies[1],
                 Eq(expected_term_frequency));
     EXPECT_THAT(matched_terms_stats[0].term_frequencies[2],
                 Eq(expected_term_frequency));
     --expected_document_id;
   }
   EXPECT_THAT(expected_document_id, Eq(-1));
 }

 TEST_F(MainIndexTest, MergeIndexBackfilling) {
   // 1. Index one doc in the Lite Index:
   // - Doc0 {"fool" is_in_prefix_section=true}
   ICING_ASSERT_OK_AND_ASSIGN(
       uint32_t tvi,
       lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));

   Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/true, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));

   // 2. Create the main index. It should have no entries in its lexicon.
   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));

   // 3. Merge the index. The main index should contain "fool".
   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));

   // 4. Index two docs in a new Lite Index:
   // - Doc1 {"foot" is_in_prefix_section=false}
   std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2";
   LiteIndex::Options options(lite_index_file_name2,
                              /*hit_buffer_want_merge_bytes=*/1024 * 1024,
                              /*hit_buffer_sort_at_indexing=*/true,
                              /*hit_buffer_sort_threshold_bytes=*/1024 * 8);
   ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
                              LiteIndex::Create(options, &icing_filesystem_));
   ICING_ASSERT_OK_AND_ASSIGN(
       tvi,
       lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));

   Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
                /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
   ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));

   // 5. Merge the index. The main index should now contain "fool", "foot"
   // and a backfill point for "foo".
   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
   // Get hits from an exact posting list the existed before the merge.
   std::vector<DocHitInfo> hits =
       GetExactHits(main_index.get(), /*term_start_index=*/0,
                    /*unnormalized_term_length=*/0, "foo");
   EXPECT_THAT(hits, IsEmpty());

   // Get hits from backfill posting list.
   hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
                        /*unnormalized_term_length=*/0, "foo");
   // We should get a hit for "fool" in doc0.
   EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
                         doc0_hit.document_id(),
                         std::vector<SectionId>{doc0_hit.section_id()})));
 }

 TEST_F(MainIndexTest, OneHitInTheFirstPageForTwoPagesMainIndex) {
   ICING_ASSERT_OK_AND_ASSIGN(
       uint32_t tvi,
       lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
   ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
                              term_id_codec_->EncodeTvi(tvi, TviType::LITE));
   SectionId section_id = 0;
   // Based on debugging logs, 2038 documents in the following setting will
   // result in two pages in the posting list chain, and the first page only
   // contains one hit.
   uint32_t num_docs = 2038;
   for (DocumentId document_id = 0; document_id < num_docs; ++document_id) {
     Hit doc_hit(section_id, document_id, Hit::kDefaultTermFrequency,
                 /*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
     ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit));
   }

   std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
   ICING_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MainIndex> main_index,
       MainIndex::Create(main_index_file_name, &filesystem_,
                         &icing_filesystem_));

   ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
   std::vector<DocHitInfo> hits =
       GetExactHits(main_index.get(), /*term_start_index=*/0,
                    /*unnormalized_term_length=*/0, "foo");
   ASSERT_THAT(hits, SizeIs(num_docs));
   for (DocumentId document_id = num_docs - 1; document_id >= 0; --document_id) {
     ASSERT_THAT(
         hits[num_docs - 1 - document_id],
         EqualsDocHitInfo(document_id, std::vector<SectionId>{section_id}));
   }
 }

 }  // namespace

 }  // namespace lib
 }  // namespace icing