blob: 37e14fc58fd18617f88122a871212e59a8612f9d [file] [log] [blame]
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/index/main/main-index-merger.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/filesystem.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/main/doc-hit-info-iterator-term-main.h"
#include "icing/index/main/main-index-merger.h"
#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/schema/section.h"
#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
namespace icing {
namespace lib {
namespace {
using ::testing::UnorderedElementsAre;
// Test fixture that stands up a fresh LiteIndex and TermIdCodec in a
// per-test scratch directory and tears the directory down afterwards.
class MainIndexMergerTest : public testing::Test {
 protected:
  void SetUp() override {
    // Every test gets its own scratch directory holding the lite index file.
    index_dir_ = GetTestTempDir() + "/test_dir";
    ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));

    std::string lite_index_path = index_dir_ + "/test_file.lite-idx.index";
    LiteIndex::Options lite_index_options(
        lite_index_path,
        /*hit_buffer_want_merge_bytes=*/1024 * 1024,
        /*hit_buffer_sort_at_indexing=*/true,
        /*hit_buffer_sort_threshold_bytes=*/1024 * 8);
    ICING_ASSERT_OK_AND_ASSIGN(
        lite_index_, LiteIndex::Create(lite_index_options, &icing_filesystem_));

    // The codec partitions the term-id space between the main and lite
    // lexicons, so it needs the maximum tvi of each.
    ICING_ASSERT_OK_AND_ASSIGN(
        term_id_codec_,
        TermIdCodec::Create(
            IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
            IcingDynamicTrie::max_value_index(
                lite_index_options.lexicon_options)));
  }

  void TearDown() override {
    ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
  }

  std::string index_dir_;
  Filesystem filesystem_;
  IcingFilesystem icing_filesystem_;
  std::unique_ptr<LiteIndex> lite_index_;
  std::unique_ptr<TermIdCodec> term_id_codec_;
};
// Namespace id assigned to every document indexed by these tests.
constexpr NamespaceId kNamespace0 = 0;
TEST_F(MainIndexMergerTest, TranslateTermNotAdded) {
  // Index two documents in the lite index, neither in a prefix section:
  //   Doc0: {"foot"}
  //   Doc1: {"fool"}
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_tvi,
      lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_term_id,
      term_id_codec_->EncodeTvi(foot_lite_tvi, TviType::LITE));
  Hit foot_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
               /*is_in_prefix_section=*/false);
  ICING_ASSERT_OK(lite_index_->AddHit(foot_lite_term_id, foot_hit));

  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_lite_tvi,
      lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_lite_term_id,
      term_id_codec_->EncodeTvi(fool_lite_tvi, TviType::LITE));
  Hit fool_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
               /*is_in_prefix_section=*/false);
  ICING_ASSERT_OK(lite_index_->AddHit(fool_lite_term_id, fool_hit));

  // Build a fake LexiconMergeOutputs that maps only 'foot' to a main tvi;
  // 'fool' is intentionally left unmapped. (The main tvi value itself is an
  // arbitrary made-up number.)
  uint32_t foot_main_tvi = 5;
  MainIndex::LexiconMergeOutputs merge_outputs;
  merge_outputs.other_tvi_to_main_tvi.emplace(foot_lite_tvi, foot_main_tvi);

  // TranslateAndExpandLiteHits must fail because 'fool' has no main tvi
  // mapping.
  ASSERT_THAT(MainIndexMerger::TranslateAndExpandLiteHits(
                  *lite_index_, *term_id_codec_, merge_outputs),
              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
}
TEST_F(MainIndexMergerTest, PrefixExpansion) {
  // Index two documents in the lite index:
  //   Doc0: {"foot"} in a non-prefix section
  //   Doc1: {"fool"} in a prefix section
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_tvi,
      lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_term_id,
      term_id_codec_->EncodeTvi(foot_lite_tvi, TviType::LITE));
  Hit foot_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
               /*is_in_prefix_section=*/false);
  ICING_ASSERT_OK(lite_index_->AddHit(foot_lite_term_id, foot_hit));

  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_lite_tvi,
      lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_lite_term_id,
      term_id_codec_->EncodeTvi(fool_lite_tvi, TviType::LITE));
  Hit fool_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
               /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(fool_lite_term_id, fool_hit));

  // Build a fake LexiconMergeOutputs. The main tvis are made-up numbers that
  // don't matter for this test; only the mappings do.
  uint32_t foo_main_tvi = 12;
  uint32_t foot_main_tvi = 5;
  uint32_t fool_main_tvi = 10;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_main_term_id,
      term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_main_term_id,
      term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_main_term_id,
      term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));

  MainIndex::LexiconMergeOutputs merge_outputs;
  // "fool" expands to the prefix "foo": its entry is the (offset, length)
  // slice [0, 0 + 1) of prefix_tvis_buf.
  merge_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_lite_tvi,
                                                      std::make_pair(0, 1));
  merge_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
  merge_outputs.other_tvi_to_main_tvi.emplace(foot_lite_tvi, foot_main_tvi);
  merge_outputs.other_tvi_to_main_tvi.emplace(fool_lite_tvi, fool_main_tvi);

  // TranslateAndExpand should:
  //   a. Translate lite term ids to main term ids based on the map.
  //   b. Expand 'fool' into an additional prefix hit for 'foo'.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::vector<TermIdHitPair> translated_pairs,
      MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
                                                  merge_outputs));
  // The expanded hit keeps doc1's section and term frequency but is flagged
  // as a prefix hit.
  Hit fool_prefix_hit(/*section_id=*/0, /*document_id=*/1,
                      Hit::kDefaultTermFrequency,
                      /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
  EXPECT_THAT(
      translated_pairs,
      UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_hit),
                           TermIdHitPair(fool_main_term_id, fool_hit),
                           TermIdHitPair(foo_main_term_id, fool_prefix_hit)));
}
TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentTermFrequencies) {
  // Index a single document containing both "foot" and "foo", each in a
  // prefix section:
  //   Doc0: {"foot", "foo"}
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_tvi,
      lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_term_id,
      term_id_codec_->EncodeTvi(foot_lite_tvi, TviType::LITE));
  Hit foot_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
               /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(foot_lite_term_id, foot_hit));

  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_lite_tvi,
      lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_lite_term_id,
      term_id_codec_->EncodeTvi(foo_lite_tvi, TviType::LITE));
  Hit foo_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
              /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(foo_lite_term_id, foo_hit));

  // Build a fake LexiconMergeOutputs. The main tvis are made-up numbers that
  // don't matter for this test; only the mappings do.
  uint32_t foo_main_tvi = 12;
  uint32_t foot_main_tvi = 5;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_main_term_id,
      term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_main_term_id,
      term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));

  MainIndex::LexiconMergeOutputs merge_outputs;
  // "foot" expands to the prefix "foo": its entry is the (offset, length)
  // slice [0, 0 + 1) of prefix_tvis_buf.
  merge_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_lite_tvi,
                                                      std::make_pair(0, 1));
  merge_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
  merge_outputs.other_tvi_to_main_tvi.emplace(foot_lite_tvi, foot_main_tvi);
  merge_outputs.other_tvi_to_main_tvi.emplace(foo_lite_tvi, foo_main_tvi);

  // TranslateAndExpand should:
  //   a. Translate lite term ids to main term ids based on the map.
  //   b. Expand 'foot' into a prefix hit for 'foo'.
  //   c. Keep BOTH the exact hit for 'foo' and the prefix hit, where the
  //      prefix hit's term frequency is the sum 57 + 1 = 58.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::vector<TermIdHitPair> translated_pairs,
      MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
                                                  merge_outputs));
  Hit foo_prefix_hit(/*section_id=*/0, /*document_id=*/0,
                     /*term_frequency=*/58,
                     /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
  EXPECT_THAT(
      translated_pairs,
      UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_hit),
                           TermIdHitPair(foo_main_term_id, foo_hit),
                           TermIdHitPair(foo_main_term_id, foo_prefix_hit)));
}
TEST_F(MainIndexMergerTest, DedupeWithExactSameTermFrequencies) {
  // 1. Index one doc in the Lite Index, both terms in a prefix section:
  // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE}
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_tvi,
      lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_term_id,
      term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_tvi,
      lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
                             term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
  // Both hits share the document, the section, and the term frequency (57).
  Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
                    /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
  Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
                   /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
  // The merged prefix hit should take the sum of the two term frequencies:
  // 57 + 57 = 114.
  Hit prefix_foo_doc0_hit(/*section_id=*/0, /*document_id=*/0,
                          /*term_frequency=*/114,
                          /*is_in_prefix_section=*/true,
                          /*is_prefix_hit=*/true);
  // 2. Build up a fake LexiconMergeOutputs.
  // The main tvis are made-up numbers that don't matter for this test.
  uint32_t foo_main_tvi = 12;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_main_term_id,
      term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
  uint32_t foot_main_tvi = 5;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_main_term_id,
      term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
  MainIndex::LexiconMergeOutputs lexicon_outputs;
  // Map "foot" to its prefix term "foo". The pair is an (offset, length)
  // slice into prefix_tvis_buf.
  lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
                                                        std::make_pair(0, 1));
  lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
  lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
  lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi);
  // 3. TranslateAndExpand should:
  // a. Translate lite term ids to main term ids based on the map
  // b. Expand 'foot' to have a hit for 'foo'
  // c. Keep both the exact hit for 'foo' and the prefix hit for 'foot', the
  //    latter with term frequency as the sum of the term frequencies.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
      MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
                                                  lexicon_outputs));
  EXPECT_THAT(expanded_term_id_hit_pairs,
              UnorderedElementsAre(
                  TermIdHitPair(foot_main_term_id, foot_doc0_hit),
                  TermIdHitPair(foo_main_term_id, foo_doc0_hit),
                  TermIdHitPair(foo_main_term_id, prefix_foo_doc0_hit)));
}
TEST_F(MainIndexMergerTest, DedupePrefixExpansion) {
  // 1. Index one doc in the Lite Index, both terms in a prefix section:
  // - Doc0 {"foot" "fool" is_in_prefix_section=TRUE}
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_tvi,
      lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_term_id,
      term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_tvi,
      lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_term_id,
      term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
  // 'foot' carries the maximum representable term frequency, so summing it
  // with 'fool''s frequency exceeds the representable range.
  Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0,
                    /*term_frequency=*/Hit::kMaxTermFrequency,
                    /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
  Hit fool_doc0_hit(/*section_id=*/0, /*document_id=*/0,
                    Hit::kDefaultTermFrequency,
                    /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, fool_doc0_hit));
  // 2. Build up a fake LexiconMergeOutputs.
  // The main tvis are made-up numbers that don't matter for this test.
  uint32_t foo_main_tvi = 12;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_term_id,
      term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
  // The merged prefix hit should take the sum of the two term frequencies,
  // capped at Hit::kMaxTermFrequency.
  Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0,
                      /*term_frequency=*/Hit::kMaxTermFrequency,
                      /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
  uint32_t foot_main_tvi = 5;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_main_term_id,
      term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
  uint32_t fool_main_tvi = 10;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_main_term_id,
      term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));
  MainIndex::LexiconMergeOutputs lexicon_outputs;
  // Map both "fool" and "foot" to their shared prefix term "foo". Each pair
  // is an (offset, length) slice into prefix_tvis_buf, so the emplace/
  // push_back ordering below matters.
  lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi,
                                                        std::make_pair(0, 1));
  lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
  lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
                                                        std::make_pair(1, 1));
  lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
  lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
  lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi);
  // 3. TranslateAndExpand should:
  // a. Translate lite term ids to main term ids based on the map
  // b. Expand 'foot' and 'fool' to have hits for 'foo'
  // c. Merge the prefix hits from 'foot' and 'fool', taking the sum as
  //    term frequency (capped at Hit::kMaxTermFrequency).
  ICING_ASSERT_OK_AND_ASSIGN(
      std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
      MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
                                                  lexicon_outputs));
  EXPECT_THAT(
      expanded_term_id_hit_pairs,
      UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit),
                           TermIdHitPair(fool_main_term_id, fool_doc0_hit),
                           TermIdHitPair(foo_term_id, doc0_prefix_hit)));
}
} // namespace
} // namespace lib
} // namespace icing