blob: 37e14fc58fd18617f88122a871212e59a8612f9d [file] [log] [blame]
// Copyright (C) 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "icing/index/main/main-index-merger.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/filesystem.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/index/main/doc-hit-info-iterator-term-main.h"
#include "icing/index/main/main-index-merger.h"
#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-property-id.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/schema/section.h"
#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
namespace icing {
namespace lib {
namespace {
using ::testing::UnorderedElementsAre;
// Test fixture that stands up a fresh LiteIndex and TermIdCodec in a
// per-test scratch directory and tears the directory down afterwards.
class MainIndexMergerTest : public testing::Test {
 protected:
  void SetUp() override {
    // Every test gets its own scratch directory holding the lite index file.
    index_dir_ = GetTestTempDir() + "/test_dir";
    ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));

    std::string lite_index_path = index_dir_ + "/test_file.lite-idx.index";
    LiteIndex::Options lite_index_options(
        lite_index_path,
        /*hit_buffer_want_merge_bytes=*/1024 * 1024,
        /*hit_buffer_sort_at_indexing=*/true,
        /*hit_buffer_sort_threshold_bytes=*/1024 * 8);
    ICING_ASSERT_OK_AND_ASSIGN(
        lite_index_, LiteIndex::Create(lite_index_options, &icing_filesystem_));

    // The codec partitions the term-id space between the main and lite
    // lexicons, so it needs the maximum tvi of each.
    ICING_ASSERT_OK_AND_ASSIGN(
        term_id_codec_,
        TermIdCodec::Create(
            IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
            IcingDynamicTrie::max_value_index(
                lite_index_options.lexicon_options)));
  }

  void TearDown() override {
    ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
  }

  std::string index_dir_;
  Filesystem filesystem_;
  IcingFilesystem icing_filesystem_;
  std::unique_ptr<LiteIndex> lite_index_;
  std::unique_ptr<TermIdCodec> term_id_codec_;
};
// Namespace id assigned to every document indexed by these tests.
constexpr NamespaceId kNamespace0 = 0;
TEST_F(MainIndexMergerTest, TranslateTermNotAdded) {
  // Index two documents in the lite index, neither in a prefix section:
  //   Doc0: {"foot"}
  //   Doc1: {"fool"}
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_tvi,
      lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_term_id,
      term_id_codec_->EncodeTvi(foot_lite_tvi, TviType::LITE));
  Hit foot_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
               /*is_in_prefix_section=*/false);
  ICING_ASSERT_OK(lite_index_->AddHit(foot_lite_term_id, foot_hit));

  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_lite_tvi,
      lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_lite_term_id,
      term_id_codec_->EncodeTvi(fool_lite_tvi, TviType::LITE));
  Hit fool_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
               /*is_in_prefix_section=*/false);
  ICING_ASSERT_OK(lite_index_->AddHit(fool_lite_term_id, fool_hit));

  // Build a fake LexiconMergeOutputs that maps only 'foot' to a main tvi;
  // 'fool' is intentionally left unmapped. (The main tvi value itself is an
  // arbitrary made-up number.)
  uint32_t foot_main_tvi = 5;
  MainIndex::LexiconMergeOutputs merge_outputs;
  merge_outputs.other_tvi_to_main_tvi.emplace(foot_lite_tvi, foot_main_tvi);

  // TranslateAndExpandLiteHits must fail because 'fool' has no main tvi
  // mapping.
  ASSERT_THAT(MainIndexMerger::TranslateAndExpandLiteHits(
                  *lite_index_, *term_id_codec_, merge_outputs),
              StatusIs(libtextclassifier3::StatusCode::INTERNAL));
}
TEST_F(MainIndexMergerTest, PrefixExpansion) {
  // Index two documents in the lite index:
  //   Doc0: {"foot"} in a non-prefix section
  //   Doc1: {"fool"} in a prefix section
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_tvi,
      lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_term_id,
      term_id_codec_->EncodeTvi(foot_lite_tvi, TviType::LITE));
  Hit foot_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
               /*is_in_prefix_section=*/false);
  ICING_ASSERT_OK(lite_index_->AddHit(foot_lite_term_id, foot_hit));

  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_lite_tvi,
      lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_lite_term_id,
      term_id_codec_->EncodeTvi(fool_lite_tvi, TviType::LITE));
  Hit fool_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
               /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(fool_lite_term_id, fool_hit));

  // Build a fake LexiconMergeOutputs. The main tvis are made-up numbers that
  // don't matter for this test; only the mappings do.
  uint32_t foo_main_tvi = 12;
  uint32_t foot_main_tvi = 5;
  uint32_t fool_main_tvi = 10;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_main_term_id,
      term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_main_term_id,
      term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_main_term_id,
      term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));

  MainIndex::LexiconMergeOutputs merge_outputs;
  // "fool" expands to the prefix "foo": its entry is the (offset, length)
  // slice [0, 0 + 1) of prefix_tvis_buf.
  merge_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_lite_tvi,
                                                      std::make_pair(0, 1));
  merge_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
  merge_outputs.other_tvi_to_main_tvi.emplace(foot_lite_tvi, foot_main_tvi);
  merge_outputs.other_tvi_to_main_tvi.emplace(fool_lite_tvi, fool_main_tvi);

  // TranslateAndExpand should:
  //   a. Translate lite term ids to main term ids based on the map.
  //   b. Expand 'fool' into an additional prefix hit for 'foo'.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::vector<TermIdHitPair> translated_pairs,
      MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
                                                  merge_outputs));
  // The expanded hit keeps doc1's section and term frequency but is flagged
  // as a prefix hit.
  Hit fool_prefix_hit(/*section_id=*/0, /*document_id=*/1,
                      Hit::kDefaultTermFrequency,
                      /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
  EXPECT_THAT(
      translated_pairs,
      UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_hit),
                           TermIdHitPair(fool_main_term_id, fool_hit),
                           TermIdHitPair(foo_main_term_id, fool_prefix_hit)));
}
TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentTermFrequencies) {
  // Index a single document containing both "foot" and "foo", each in a
  // prefix section:
  //   Doc0: {"foot", "foo"}
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_tvi,
      lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_lite_term_id,
      term_id_codec_->EncodeTvi(foot_lite_tvi, TviType::LITE));
  Hit foot_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
               /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(foot_lite_term_id, foot_hit));

  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_lite_tvi,
      lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_lite_term_id,
      term_id_codec_->EncodeTvi(foo_lite_tvi, TviType::LITE));
  Hit foo_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
              /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(foo_lite_term_id, foo_hit));

  // Build a fake LexiconMergeOutputs. The main tvis are made-up numbers that
  // don't matter for this test; only the mappings do.
  uint32_t foo_main_tvi = 12;
  uint32_t foot_main_tvi = 5;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_main_term_id,
      term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_main_term_id,
      term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));

  MainIndex::LexiconMergeOutputs merge_outputs;
  // "foot" expands to the prefix "foo": its entry is the (offset, length)
  // slice [0, 0 + 1) of prefix_tvis_buf.
  merge_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_lite_tvi,
                                                      std::make_pair(0, 1));
  merge_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
  merge_outputs.other_tvi_to_main_tvi.emplace(foot_lite_tvi, foot_main_tvi);
  merge_outputs.other_tvi_to_main_tvi.emplace(foo_lite_tvi, foo_main_tvi);

  // TranslateAndExpand should:
  //   a. Translate lite term ids to main term ids based on the map.
  //   b. Expand 'foot' into a prefix hit for 'foo'.
  //   c. Keep BOTH the exact hit for 'foo' and the prefix hit, where the
  //      prefix hit's term frequency is the sum 57 + 1 = 58.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::vector<TermIdHitPair> translated_pairs,
      MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
                                                  merge_outputs));
  Hit foo_prefix_hit(/*section_id=*/0, /*document_id=*/0,
                     /*term_frequency=*/58,
                     /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
  EXPECT_THAT(
      translated_pairs,
      UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_hit),
                           TermIdHitPair(foo_main_term_id, foo_hit),
                           TermIdHitPair(foo_main_term_id, foo_prefix_hit)));
}
TEST_F(MainIndexMergerTest, DedupeWithExactSameTermFrequencies) {
  // 1. Index one doc in the Lite Index, both terms in a prefix section:
  // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE}
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_tvi,
      lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_term_id,
      term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_tvi,
      lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
                             term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
  // Both hits share the document, the section, and the term frequency (57).
  Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
                    /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
  Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
                   /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
  // The merged prefix hit should take the sum of the two term frequencies:
  // 57 + 57 = 114.
  Hit prefix_foo_doc0_hit(/*section_id=*/0, /*document_id=*/0,
                          /*term_frequency=*/114,
                          /*is_in_prefix_section=*/true,
                          /*is_prefix_hit=*/true);
  // 2. Build up a fake LexiconMergeOutputs.
  // The main tvis are made-up numbers that don't matter for this test.
  uint32_t foo_main_tvi = 12;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_main_term_id,
      term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
  uint32_t foot_main_tvi = 5;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_main_term_id,
      term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
  MainIndex::LexiconMergeOutputs lexicon_outputs;
  // Map "foot" to its prefix term "foo". The pair is an (offset, length)
  // slice into prefix_tvis_buf.
  lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
                                                        std::make_pair(0, 1));
  lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
  lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
  lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi);
  // 3. TranslateAndExpand should:
  // a. Translate lite term ids to main term ids based on the map
  // b. Expand 'foot' to have a hit for 'foo'
  // c. Keep both the exact hit for 'foo' and the prefix hit for 'foot', the
  //    latter with term frequency as the sum of the term frequencies.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
      MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
                                                  lexicon_outputs));
  EXPECT_THAT(expanded_term_id_hit_pairs,
              UnorderedElementsAre(
                  TermIdHitPair(foot_main_term_id, foot_doc0_hit),
                  TermIdHitPair(foo_main_term_id, foo_doc0_hit),
                  TermIdHitPair(foo_main_term_id, prefix_foo_doc0_hit)));
}
TEST_F(MainIndexMergerTest, DedupePrefixExpansion) {
  // 1. Index one doc in the Lite Index, both terms in a prefix section:
  // - Doc0 {"foot" "fool" is_in_prefix_section=TRUE}
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_tvi,
      lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_term_id,
      term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_tvi,
      lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_term_id,
      term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
  // 'foot' carries the maximum representable term frequency, so summing it
  // with 'fool''s frequency exceeds the representable range.
  Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0,
                    /*term_frequency=*/Hit::kMaxTermFrequency,
                    /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
  Hit fool_doc0_hit(/*section_id=*/0, /*document_id=*/0,
                    Hit::kDefaultTermFrequency,
                    /*is_in_prefix_section=*/true);
  ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, fool_doc0_hit));
  // 2. Build up a fake LexiconMergeOutputs.
  // The main tvis are made-up numbers that don't matter for this test.
  uint32_t foo_main_tvi = 12;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foo_term_id,
      term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
  // The merged prefix hit should take the sum of the two term frequencies,
  // capped at Hit::kMaxTermFrequency.
  Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0,
                      /*term_frequency=*/Hit::kMaxTermFrequency,
                      /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
  uint32_t foot_main_tvi = 5;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t foot_main_term_id,
      term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
  uint32_t fool_main_tvi = 10;
  ICING_ASSERT_OK_AND_ASSIGN(
      uint32_t fool_main_term_id,
      term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));
  MainIndex::LexiconMergeOutputs lexicon_outputs;
  // Map both "fool" and "foot" to their shared prefix term "foo". Each pair
  // is an (offset, length) slice into prefix_tvis_buf, so the emplace/
  // push_back ordering below matters.
  lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi,
                                                        std::make_pair(0, 1));
  lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
  lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
                                                        std::make_pair(1, 1));
  lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
  lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
  lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi);
  // 3. TranslateAndExpand should:
  // a. Translate lite term ids to main term ids based on the map
  // b. Expand 'foot' and 'fool' to have hits for 'foo'
  // c. Merge the prefix hits from 'foot' and 'fool', taking the sum as
  //    term frequency (capped at Hit::kMaxTermFrequency).
  ICING_ASSERT_OK_AND_ASSIGN(
      std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
      MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
                                                  lexicon_outputs));
  EXPECT_THAT(
      expanded_term_id_hit_pairs,
      UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit),
                           TermIdHitPair(fool_main_term_id, fool_doc0_hit),
                           TermIdHitPair(foo_term_id, doc0_prefix_hit)));
}
} // namespace
} // namespace lib
} // namespace icing